diff --git a/Cargo.toml b/Cargo.toml index 1563b4c..3c158aa 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -13,5 +13,6 @@ linked-hash-map = "0.5.6" nom = "7.1.3" postgres = { version = "0.19.4", features = ["with-chrono-0_4"] } pretty_assertions = "1.3.0" +regex = "1.7.1" serde = { version = "1.0.152", features = ["derive"] } serde_json = "1.0.93" diff --git a/src/data/interface.rs b/src/data/interface.rs index 10a0442..8073702 100644 --- a/src/data/interface.rs +++ b/src/data/interface.rs @@ -15,7 +15,7 @@ use crate::static_data::Assembly; /// aliases | AT1,ATA,ATC,ATD,ATE,ATDC,TEL1,TELO1 /// added | 2014-02-04 21:39:32.57125 /// ``` -#[derive(Debug, PartialEq)] +#[derive(Debug, PartialEq, Default, Clone)] pub struct GeneInfoRecord { pub hgnc: String, pub maploc: String, @@ -44,7 +44,7 @@ pub struct GeneInfoRecord { /// structure means that the transcripts are defined on the same /// reference sequence and have the same exon spans on that /// sequence. -#[derive(Debug, PartialEq)] +#[derive(Debug, PartialEq, Default, Clone)] pub struct TxSimilarityRecord { /// Accession of first transcript. pub tx_ac1: String, @@ -81,7 +81,7 @@ pub struct TxSimilarityRecord { /// alt_exon_id | 6063334 /// exon_aln_id | 3461425 ///``` -#[derive(Debug, PartialEq)] +#[derive(Debug, PartialEq, Default, Clone)] pub struct TxExonsRecord { pub hgnc: String, pub tx_ac: String, @@ -111,7 +111,7 @@ pub struct TxExonsRecord { /// start_i | 95226307 /// end_i | 95248406 /// ``` -#[derive(Debug, PartialEq)] +#[derive(Debug, PartialEq, Default, Clone)] pub struct TxForRegionRecord { pub tx_ac: String, pub alt_ac: String, @@ -130,7 +130,7 @@ pub struct TxForRegionRecord { /// lengths | {707,79,410} /// hgnc | VSX1 /// ``` -#[derive(Debug, PartialEq)] +#[derive(Debug, PartialEq, Default, Clone)] pub struct TxIdentityInfo { pub tx_ac: String, pub alt_ac: String, @@ -149,7 +149,7 @@ pub struct TxIdentityInfo { /// alt_ac | AC_000143.1 /// alt_aln_method | splign /// ``` -#[derive(Debug, PartialEq)] +#[derive(Debug, PartialEq, Default, Clone)] pub struct TxInfoRecord { pub hgnc: String, pub cds_start_i: Option, @@ -169,14 +169,15 @@ pub struct TxInfoRecord { /// alt_ac | NC_000012.11 /// alt_aln_method | genebuild /// ``` -#[derive(Debug, PartialEq)] +#[derive(Debug, PartialEq, Default, Clone)] pub struct TxMappingOptionsRecord { pub tx_ac: String, pub alt_ac: String, pub alt_aln_method: String, } -pub trait Interface { +/// Interface for data providers. +pub trait Provider { /// Return the data version, e.g., `uta_20180821`. fn data_version(&self) -> &str; @@ -198,7 +199,7 @@ pub trait Interface { /// # Arguments /// /// * `hgnc` - HGNC gene name - fn get_gene_info(&mut self, hgnc: &str) -> Result; + fn get_gene_info(&self, hgnc: &str) -> Result; /// Return the (single) associated protein accession for a given transcript accession, /// or None if not found. @@ -206,14 +207,14 @@ pub trait Interface { /// # Arguments /// /// * `tx_ac` -- transcript accession with version (e.g., 'NM_000051.3') - fn get_pro_ac_for_tx_ac(&mut self, tx_ac: &str) -> Result, anyhow::Error>; + fn get_pro_ac_for_tx_ac(&self, tx_ac: &str) -> Result, anyhow::Error>; /// Return full sequence for the given accession. /// /// # Arguments /// /// * `ac` -- accession - fn get_seq(&mut self, ac: &str) -> Result; + fn get_seq(&self, ac: &str) -> Result; /// Return sequence part for the given accession. /// @@ -223,7 +224,7 @@ pub trait Interface { /// * `start` -- start position (0-based, start of sequence if missing) /// * `end` -- end position (0-based, end of sequence if missing) fn get_seq_part( - &mut self, + &self, ac: &str, begin: Option, end: Option, @@ -236,7 +237,7 @@ pub trait Interface { /// /// * `tx_ac` -- transcript accession with version (e.g., 'NM_000051.3') fn get_similar_transcripts( - &mut self, + &self, tx_ac: &str, ) -> Result, anyhow::Error>; @@ -249,7 +250,7 @@ pub trait Interface { /// * `alt_ac` -- specific genomic sequence (e.g., NC_000011.4) /// * `alt_aln_method` -- sequence alignment method (e.g., splign, blat) fn get_tx_exons( - &mut self, + &self, tx_ac: &str, alt_ac: &str, alt_aln_method: &str, @@ -260,7 +261,7 @@ pub trait Interface { /// # Arguments /// /// * `gene` - HGNC gene name - fn get_tx_for_gene(&mut self, gene: &str) -> Result, anyhow::Error>; + fn get_tx_for_gene(&self, gene: &str) -> Result, anyhow::Error>; /// Return transcripts that overlap given region. /// @@ -271,7 +272,7 @@ pub trait Interface { // * `start_i` -- 5' bound of region // * `end_i` -- 3' bound of region fn get_tx_for_region( - &mut self, + &self, alt_ac: &str, alt_aln_method: &str, start_i: i32, @@ -283,7 +284,7 @@ pub trait Interface { /// # Arguments /// /// * `tx_ac` -- transcript accession with version (e.g., 'NM_199425.2') - fn get_tx_identity_info(&mut self, tx_ac: &str) -> Result; + fn get_tx_identity_info(&self, tx_ac: &str) -> Result; /// Return a single transcript info for supplied accession (tx_ac, alt_ac, alt_aln_method), or None if not found. /// @@ -293,7 +294,7 @@ pub trait Interface { /// * `alt_ac -- specific genomic sequence (e.g., NC_000011.4) /// * `alt_aln_method` -- sequence alignment method (e.g., splign, blat) fn get_tx_info( - &mut self, + &self, tx_ac: &str, alt_ac: &str, alt_aln_method: &str, @@ -308,7 +309,7 @@ pub trait Interface { /// /// * `tx_ac` -- transcript accession with version (e.g., 'NM_000051.3') fn get_tx_mapping_options( - &mut self, + &self, tax_ac: &str, ) -> Result, anyhow::Error>; } diff --git a/src/data/mod.rs b/src/data/mod.rs index 4a1d104..da2d2ba 100644 --- a/src/data/mod.rs +++ b/src/data/mod.rs @@ -1,6 +1,3 @@ ///! Datatypes, interfaces, and data acess. -mod interface; -mod uta; - -pub use interface::*; -pub use uta::*; +pub mod interface; +pub mod uta; diff --git a/src/data/uta.rs b/src/data/uta.rs index c0ddf41..f6c0a9e 100644 --- a/src/data/uta.rs +++ b/src/data/uta.rs @@ -5,12 +5,14 @@ use linked_hash_map::LinkedHashMap; use postgres::{Client, NoTls, Row}; use std::fmt::Debug; +use std::sync::Mutex; use crate::static_data::{Assembly, ASSEMBLY_INFOS}; -use super::{ - GeneInfoRecord, Interface, TxExonsRecord, TxForRegionRecord, TxIdentityInfo, TxInfoRecord, - TxMappingOptionsRecord, TxSimilarityRecord, +use crate::data::{ + interface::GeneInfoRecord, interface::Provider as ProviderInterface, interface::TxExonsRecord, + interface::TxForRegionRecord, interface::TxIdentityInfo, interface::TxInfoRecord, + interface::TxMappingOptionsRecord, interface::TxSimilarityRecord, }; /// Configurationf or the `data::uta::Provider`. @@ -157,7 +159,7 @@ pub struct Provider { /// Configuration for the access. config: Config, /// Connection to the postgres database. - conn: Client, + conn: Mutex, /// The schema version, set on creation. schema_version: String, } @@ -174,8 +176,9 @@ impl Debug for Provider { impl Provider { pub fn with_config(config: &Config) -> Result { let config = config.clone(); - let mut conn = Client::connect(&config.db_url, NoTls)?; - let schema_version = Self::fetch_schema_version(&mut conn, &config.db_schema)?; + let conn = Mutex::new(Client::connect(&config.db_url, NoTls)?); + let schema_version = + Self::fetch_schema_version(&mut conn.lock().unwrap(), &config.db_schema)?; Ok(Self { config, conn, @@ -190,7 +193,7 @@ impl Provider { } } -impl Interface for Provider { +impl ProviderInterface for Provider { fn data_version(&self) -> &str { &self.config.db_schema } @@ -208,32 +211,39 @@ impl Interface for Provider { ) } - fn get_gene_info(&mut self, hgnc: &str) -> Result { + fn get_gene_info(&self, hgnc: &str) -> Result { let sql = format!( "SELECT * FROM {}.gene WHERE hgnc = $1", self.config.db_schema ); - self.conn.query_one(&sql, &[&hgnc])?.try_into() + self.conn + .lock() + .unwrap() + .query_one(&sql, &[&hgnc])? + .try_into() } - fn get_pro_ac_for_tx_ac(&mut self, tx_ac: &str) -> Result, anyhow::Error> { + fn get_pro_ac_for_tx_ac(&self, tx_ac: &str) -> Result, anyhow::Error> { let sql = format!( "SELECT pro_ac FROM {}.associated_accessions \ WHERE tx_ac = $1 ORDER BY pro_ac DESC", self.config.db_schema ); - if let Some(row) = (self.conn.query(&sql, &[&tx_ac])?).into_iter().next() { + if let Some(row) = (self.conn.lock().unwrap().query(&sql, &[&tx_ac])?) + .into_iter() + .next() + { return Ok(Some(row.try_get("pro_ac")?)); } Ok(None) } - fn get_seq(&mut self, ac: &str) -> Result { + fn get_seq(&self, ac: &str) -> Result { self.get_seq_part(ac, None, None) } fn get_seq_part( - &mut self, + &self, ac: &str, begin: Option, end: Option, @@ -242,13 +252,23 @@ impl Interface for Provider { "SELECT seq_id FROM {}.seq_anno WHERE ac = $1", self.config.db_schema ); - let seq_id: String = self.conn.query_one(&sql, &[&ac])?.try_get("seq_id")?; + let seq_id: String = self + .conn + .lock() + .unwrap() + .query_one(&sql, &[&ac])? + .try_get("seq_id")?; let sql = format!( "SELECT seq FROM {}.seq WHERE seq_id = $1", self.config.db_schema ); - let seq: String = self.conn.query_one(&sql, &[&seq_id])?.try_get("seq")?; + let seq: String = self + .conn + .lock() + .unwrap() + .query_one(&sql, &[&seq_id])? + .try_get("seq")?; let begin = begin.unwrap_or_default(); let end = end @@ -258,9 +278,9 @@ impl Interface for Provider { } fn get_similar_transcripts( - &mut self, + &self, tx_ac: &str, - ) -> Result, anyhow::Error> { + ) -> Result, anyhow::Error> { let sql = format!( "SELECT * FROM {}.tx_similarity_v \ WHERE tx_ac1 = $1 \ @@ -268,27 +288,31 @@ impl Interface for Provider { self.config.db_schema ); let mut result = Vec::new(); - for row in self.conn.query(&sql, &[&tx_ac])? { + for row in self.conn.lock().unwrap().query(&sql, &[&tx_ac])? { result.push(row.try_into()?); } Ok(result) } fn get_tx_exons( - &mut self, + &self, tx_ac: &str, alt_ac: &str, alt_aln_method: &str, - ) -> Result, anyhow::Error> { + ) -> Result, anyhow::Error> { let sql = format!( "SELECT * FROM {}.tx_exon_aln_v \ WHERE tx_ac = $1 AND alt_ac = $2 and alt_aln_method = $3 \ - ORDER BY tx_start_i, tx_end_i, alt_start_i, alt_end_i, \ - tx_exon_set_id, alt_exon_set_id", + ORDER BY alt_start_i", self.config.db_schema ); let mut result = Vec::new(); - for row in self.conn.query(&sql, &[&tx_ac, &alt_ac, &alt_aln_method])? { + for row in self + .conn + .lock() + .unwrap() + .query(&sql, &[&tx_ac, &alt_ac, &alt_aln_method])? + { result.push(row.try_into()?); } if result.is_empty() { @@ -303,7 +327,7 @@ impl Interface for Provider { } } - fn get_tx_for_gene(&mut self, gene: &str) -> Result, anyhow::Error> { + fn get_tx_for_gene(&self, gene: &str) -> Result, anyhow::Error> { let sql = format!( "SELECT hgnc, cds_start_i, cds_end_i, tx_ac, alt_ac, alt_aln_method \ FROM {}.transcript T \ @@ -313,14 +337,14 @@ impl Interface for Provider { self.config.db_schema, self.config.db_schema, ); let mut result = Vec::new(); - for row in self.conn.query(&sql, &[&gene])? { + for row in self.conn.lock().unwrap().query(&sql, &[&gene])? { result.push(row.try_into()?); } Ok(result) } fn get_tx_for_region( - &mut self, + &self, alt_ac: &str, alt_aln_method: &str, start_i: i32, @@ -338,7 +362,12 @@ impl Interface for Provider { self.config.db_schema, self.config.db_schema, ); let mut result = Vec::new(); - for row in self.conn.query(&sql, &[&alt_ac, &start_i, &end_i])? { + for row in self + .conn + .lock() + .unwrap() + .query(&sql, &[&alt_ac, &start_i, &end_i])? + { let record: TxForRegionRecord = row.try_into()?; // NB: The original Python code did not use alt_aln_method in the query either. if record.alt_aln_method == alt_aln_method { @@ -348,7 +377,7 @@ impl Interface for Provider { Ok(result) } - fn get_tx_identity_info(&mut self, tx_ac: &str) -> Result { + fn get_tx_identity_info(&self, tx_ac: &str) -> Result { let sql = format!( "SELECT DISTINCT(tx_ac), alt_ac, alt_aln_method, cds_start_i, \ cds_end_i, lengths, hgnc \ @@ -357,11 +386,15 @@ impl Interface for Provider { ORDER BY tx_ac, alt_ac, alt_aln_method, cds_start_i, cds_end_i, lengths, hgnc", self.config.db_schema ); - self.conn.query_one(&sql, &[&tx_ac])?.try_into() + self.conn + .lock() + .unwrap() + .query_one(&sql, &[&tx_ac])? + .try_into() } fn get_tx_info( - &mut self, + &self, tx_ac: &str, alt_ac: &str, alt_aln_method: &str, @@ -375,12 +408,14 @@ impl Interface for Provider { self.config.db_schema, self.config.db_schema, ); self.conn + .lock() + .unwrap() .query_one(&sql, &[&tx_ac, &alt_ac, &alt_aln_method])? .try_into() } fn get_tx_mapping_options( - &mut self, + &self, tx_ac: &str, ) -> Result, anyhow::Error> { let sql = format!( @@ -391,7 +426,7 @@ impl Interface for Provider { self.config.db_schema ); let mut result = Vec::new(); - for row in self.conn.query(&sql, &[&tx_ac])? { + for row in self.conn.lock().unwrap().query(&sql, &[&tx_ac])? { result.push(row.try_into()?); } Ok(result) @@ -400,7 +435,7 @@ impl Interface for Provider { #[cfg(test)] mod test { - use crate::{data::Interface, static_data::Assembly}; + use crate::{data::interface::Provider as ProviderInterface, static_data::Assembly}; use super::{Config, Provider}; @@ -441,7 +476,7 @@ mod test { #[test] fn get_gene_info() -> Result<(), anyhow::Error> { - let mut provider = Provider::with_config(&get_config())?; + let provider = Provider::with_config(&get_config())?; assert_eq!( format!("{:?}", provider.get_gene_info("OMA1")?), @@ -456,7 +491,7 @@ mod test { #[test] fn get_pro_ac_for_tx_ac() -> Result<(), anyhow::Error> { - let mut provider = Provider::with_config(&get_config())?; + let provider = Provider::with_config(&get_config())?; assert_eq!( provider.get_pro_ac_for_tx_ac("NM_130831.2")?, @@ -469,7 +504,7 @@ mod test { #[test] fn get_seq() -> Result<(), anyhow::Error> { - let mut provider = Provider::with_config(&get_config())?; + let provider = Provider::with_config(&get_config())?; assert_eq!(provider.get_seq("NM_001354664.1")?.len(), 6386); @@ -478,7 +513,7 @@ mod test { #[test] fn get_seq_part() -> Result<(), anyhow::Error> { - let mut provider = Provider::with_config(&get_config())?; + let provider = Provider::with_config(&get_config())?; assert_eq!( provider @@ -492,7 +527,7 @@ mod test { #[test] fn get_similar_transcripts() -> Result<(), anyhow::Error> { - let mut provider = Provider::with_config(&get_config())?; + let provider = Provider::with_config(&get_config())?; let records = provider.get_similar_transcripts("NM_001354664.1")?; @@ -509,7 +544,7 @@ mod test { #[test] fn get_tx_exons() -> Result<(), anyhow::Error> { - let mut provider = Provider::with_config(&get_config())?; + let provider = Provider::with_config(&get_config())?; let records = provider.get_tx_exons("NM_001354664.1", "NC_000003.11", "splign")?; @@ -529,7 +564,7 @@ mod test { #[test] fn get_tx_for_gene() -> Result<(), anyhow::Error> { - let mut provider = Provider::with_config(&get_config())?; + let provider = Provider::with_config(&get_config())?; let records = provider.get_tx_for_gene("OMA1")?; @@ -546,7 +581,7 @@ mod test { #[test] fn get_tx_for_region() -> Result<(), anyhow::Error> { - let mut provider = Provider::with_config(&get_config())?; + let provider = Provider::with_config(&get_config())?; let records = provider.get_tx_for_region("NC_000001.10", "splign", 58946391, 59012446)?; @@ -563,7 +598,7 @@ mod test { #[test] fn get_tx_identity_info() -> Result<(), anyhow::Error> { - let mut provider = Provider::with_config(&get_config())?; + let provider = Provider::with_config(&get_config())?; let record = provider.get_tx_identity_info("ENST00000421528")?; @@ -579,7 +614,7 @@ mod test { #[test] fn get_tx_info() -> Result<(), anyhow::Error> { - let mut provider = Provider::with_config(&get_config())?; + let provider = Provider::with_config(&get_config())?; let record = provider.get_tx_info("ENST00000421528", "NC_000001.10", "genebuild")?; @@ -595,7 +630,7 @@ mod test { #[test] fn get_tx_mapping_options() -> Result<(), anyhow::Error> { - let mut provider = Provider::with_config(&get_config())?; + let provider = Provider::with_config(&get_config())?; let records = provider.get_tx_mapping_options("ENST00000421528")?; diff --git a/src/lib.rs b/src/lib.rs index 1f47177..917d65c 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,6 +1,8 @@ pub mod data; +pub mod mapper; pub mod parser; pub mod static_data; +pub mod validator; pub fn add(left: usize, right: usize) -> usize { left + right diff --git a/src/mapper/alignment.rs b/src/mapper/alignment.rs new file mode 100644 index 0000000..23cc4a4 --- /dev/null +++ b/src/mapper/alignment.rs @@ -0,0 +1,1028 @@ +//! Mapping positions between pairs of sequence alignments. +//! +//! `AlignmentMapper` is at the heart of mapping between aligned sequences. + +// Implementation note re: "no-zero correction": HGVS doesn't have a +// 0. Counting is -3, -2, -1, 1, 2, 3 :-/ Coordinate calculations must +// take this discontinuity in c. positions into account. The +// implementation of imaginary transcript positions creates a second +// discontinuity. (By analogy with c., n.0 is declared to not exist.) +// The strategy used in this code is to use internal c0 and n0 +// coordinates, which include 0, for coordinate calculations and to +// translate these to c. and n. positions as needed. +// +// imag. imag. +// upstream 5' UTR CDS 3' UTR downstr +// |> +// - - - - - - ———————————— ||||||||||||||||| ——————————— - - - - - - +// a b C D E f g h i +// c. -4 -3 -2 -1 ! 1 2 3 ! *1 *2 *3 *4 +// c0 -4 -3 -2 -1 0 1 2 3 4 5 6 +// n0 -2 -1 0 1 2 3 4 5 6 7 8 +// n. -2 -1 ! 1 2 3 4 5 6 7 8 9 +// g. ... 123 124 125 126 127 128 129 130 131 132 133 ... + +use std::rc::Rc; + +use crate::{ + data::interface::{Provider, TxExonsRecord}, + parser::{CdsInterval, CdsPos, GenomeInterval, Mu, TxInterval, TxPos}, +}; + +use super::cigar::{ + parse_cigar_string, CigarElement, CigarMapper, CigarMapperResult, CigarOp, CigarString, +}; + +/// Convert zero-based coordinate to hgvs (1 based, missing zero) +fn zbc_to_hgvs(i: i32) -> i32 { + if i >= 0 { + i + 1 + } else { + i + } +} + +/// Convert hgvs (1 based, missing zero) +fn hgvs_to_zbc(i: i32) -> i32 { + if i >= 1 { + i - 1 + } else { + i + } +} + +/// Builds a single CIGAR string representing an alignment of the transcript sequence to a +/// reference sequence, including introns. +/// +/// The input exons are expected to be in transcript order, and the resulting CIGAR is also +/// in transcript order. +pub fn build_tx_cigar( + exons: &Vec, + strand: i16, +) -> Result { + if exons.is_empty() { + return Err(anyhow::anyhow!( + "Cannot build CIGAR string from empty exons" + )); + } + + // Parse CIGAR string and flip if on reverse strand. + let exon_cigars: Result, anyhow::Error> = exons + .iter() + .map(|record| { + let mut cigar = parse_cigar_string(&record.cigar)?; + if strand == -1 { + cigar.reverse(); + } + Ok(cigar) + }) + .collect(); + let exon_cigars = exon_cigars?; + + let mut result = exon_cigars[0].clone(); // exon 1 + for i in 1..exon_cigars.len() { + result.push(CigarElement { + count: exons[i].alt_start_i - exons[i - 1].alt_end_i, + op: CigarOp::Skip, + }); + result.append(&mut exon_cigars[i].clone()); + } + Ok(result) +} + +/// Helper function that wraps a value into `Option` but returns `None` for the default value. +pub fn none_if_default(value: T) -> Option +where + T: Default + PartialEq, +{ + if value == T::default() { + None + } else { + Some(value) + } +} + +/// Configuration for mapping. +pub struct Config { + /// Require transcript variants to be within transcript sequence bounds. + pub strict_bounds: bool, +} + +impl Default for Config { + fn default() -> Self { + Self { + strict_bounds: true, + } + } +} + +/// Map HGVS location objects between genomic (g), non-coding (n) and cds (c) +/// coordinates according to a CIGAR string. +pub struct AlignmentMapper { + /// Configuration for alignment mapping. + pub config: Config, + /// Data provider to use for the mapping. + pub provider: Rc, + + /// The transcript accession. + pub tx_ac: String, + /// The reference sequence asccession. + pub alt_ac: String, + /// The alignment method. + pub alt_aln_method: String, + pub strand: i16, + pub gc_offset: i32, + pub cds_start_i: Option, + pub cds_end_i: Option, + pub tgt_len: i32, + pub cigar_mapper: CigarMapper, +} + +impl AlignmentMapper { + pub fn new( + provider: Rc, + tx_ac: &str, + alt_ac: &str, + alt_aln_method: &str, + ) -> Result { + let (strand, gc_offset, cds_start_i, cds_end_i, tgt_len, cigar_mapper) = + if alt_aln_method != "transcript" { + let tx_info = provider.get_tx_info(tx_ac, alt_ac, alt_aln_method)?; + let tx_exons = { + let tx_exons = provider.get_tx_exons(tx_ac, alt_ac, alt_aln_method)?; + if tx_exons.is_empty() { + return Err(anyhow::anyhow!( + "Found no exons for tx_ac={}, alt_ac={}, alt_aln_method={}", + tx_ac, + alt_ac, + alt_aln_method + )); + } + + // Issue biocommons/hgvs#386: An assumption when building the CIGAR string is that + // exons are adjacent. Assert that here. + let mut sorted_exons = tx_exons.clone(); + sorted_exons.sort_by(|a, b| a.ord.partial_cmp(&b.ord).unwrap()); + let offenders = sorted_exons + .windows(2) + .filter(|pair| { + let lhs = &pair[0]; + let rhs = &pair[1]; + lhs.tx_end_i != rhs.tx_start_i + }) + .collect::>(); + if !offenders.is_empty() { + return Err(anyhow::anyhow!( + "Non-adjacent exons for tx_acc={}, alt_acc={}, alt_aln_method={}: {:?}", + tx_ac, + alt_ac, + alt_aln_method, + &offenders + )); + } + + tx_exons + }; + + let strand = tx_exons[0].alt_strand; + let gc_offset = tx_exons[0].alt_start_i; + let cds_start_i = tx_info.cds_start_i; + let cds_end_i = tx_info.cds_end_i; + + if cds_start_i.is_none() != cds_end_i.is_none() { + return Err(anyhow::anyhow!( + "CDS start and end must both be defined or undefined" + )); + } + + let cigar_mapper = CigarMapper::new(&build_tx_cigar(&tx_exons, strand)?); + let tgt_len = cigar_mapper.tgt_len; + + ( + strand, + gc_offset, + cds_start_i, + cds_end_i, + tgt_len, + cigar_mapper, + ) + } else { + // this covers the identity cases n <-> c + let tx_identity_info = provider.get_tx_identity_info(tx_ac)?; + + let cds_start_i = tx_identity_info.cds_start_i; + let cds_end_i = tx_identity_info.cds_end_i; + let tgt_len: i32 = tx_identity_info.lengths.iter().sum(); + + ( + 1, // strand + 0, // gc_offset + Some(cds_start_i), + Some(cds_end_i), + tgt_len, + Default::default(), + ) + }; + + if cds_start_i.is_none() != cds_end_i.is_none() { + return Err(anyhow::anyhow!( + "CDs start and end position but be both or neither defined." + )); + } + + Ok(AlignmentMapper { + config: Default::default(), + provider, + tx_ac: tx_ac.to_string(), + alt_ac: alt_ac.to_string(), + alt_aln_method: alt_aln_method.to_string(), + cds_start_i, + cds_end_i, + tgt_len, + cigar_mapper, + strand, + gc_offset, + }) + } + + /// Convert a genomic (g.) interval to a transcript (n.) interval. + pub fn g_to_n( + &self, + g_interval: &GenomeInterval, + _strict_bounds: bool, + ) -> Result, anyhow::Error> { + if let GenomeInterval { + start: Some(begin), + end: Some(end), + } = g_interval + { + let grs = begin - 1 - self.gc_offset; + let gre = end - 1 - self.gc_offset; + + // Compute start/end on forward strand with respect to the genome. + // + // frs, fre = (f)orward (r)na (s)tart & (e)nd + let frs = self + .cigar_mapper + .map_ref_to_tgt(grs, "start", self.config.strict_bounds)?; + let fre = self + .cigar_mapper + .map_ref_to_tgt(gre, "end", self.config.strict_bounds)?; + + // Project to reverse strand if necessary. + let (frs, fre) = if self.strand == -1 { + ( + CigarMapperResult { + pos: self.tgt_len - 1 - fre.pos, + offset: -fre.offset, + cigar_op: fre.cigar_op, + }, + CigarMapperResult { + pos: self.tgt_len - 1 - frs.pos, + offset: -frs.offset, + cigar_op: frs.cigar_op, + }, + ) + } else { + (frs, fre) + }; + + // The position is uncertain if the alignment ends in a gap. + let n_interval = TxInterval { + start: TxPos { + base: zbc_to_hgvs(frs.pos), + offset: none_if_default(frs.offset), + }, + end: TxPos { + base: zbc_to_hgvs(fre.pos), + offset: none_if_default(fre.offset), + }, + }; + let result = if frs.cigar_op != CigarOp::Del + && frs.cigar_op != CigarOp::Ins + && fre.cigar_op != CigarOp::Del + && fre.cigar_op != CigarOp::Ins + { + Mu::Certain(n_interval) + } else { + Mu::Uncertain(n_interval) + }; + Ok(result) + } else { + Err(anyhow::anyhow!( + "Cannot project genome interval with missing start or end position: {}", + g_interval + )) + } + } + + /// Convert a transcript (n.) interval to a genomic (g.) interval. + pub fn n_to_g( + &self, + n_interval: &TxInterval, + strict_bounds: bool, + ) -> Result, anyhow::Error> { + let frs = hgvs_to_zbc(n_interval.start.base); + let start_offset = n_interval.start.offset.unwrap_or(0); + let fre = hgvs_to_zbc(n_interval.end.base); + let end_offset = n_interval.end.offset.unwrap_or(0); + + let (fre, frs, start_offset, end_offset) = if self.strand == -1 { + ( + self.tgt_len - 1 - frs, + self.tgt_len - 1 - fre, + -end_offset, + -start_offset, + ) + } else { + (fre, frs, start_offset, end_offset) + }; + + // Obtain the genomic range start (grs) and end (gre). + let grs = self + .cigar_mapper + .map_tgt_to_ref(frs, "start", strict_bounds)?; + let gre = self + .cigar_mapper + .map_tgt_to_ref(fre, "start", strict_bounds)?; + let (grs_pos, gre_pos) = (grs.pos + self.gc_offset + 1, gre.pos + self.gc_offset + 1); + let (gs, ge) = (grs_pos + start_offset, gre_pos + end_offset); + + // The returned interval would be uncertain when locating at alignment gaps. + Ok(Mu::from( + GenomeInterval { + start: Some(gs), + end: Some(ge), + }, + grs.cigar_op != CigarOp::Del + && grs.cigar_op != CigarOp::Ins + && gre.cigar_op != CigarOp::Del + && gre.cigar_op != CigarOp::Ins, + )) + } + + fn pos_n_to_c(&self, pos: &TxPos) -> CdsPos { + let cds_start_i = self.cds_start_i.unwrap(); + let cds_end_i = self.cds_end_i.unwrap(); + if pos.base <= cds_start_i { + CdsPos { + base: pos.base - cds_start_i - (if pos.base > 0 { 1 } else { 0 }), + offset: pos.offset, + cds_from: crate::parser::CdsFrom::Start, + } + } else if pos.base > cds_start_i && pos.base <= cds_end_i { + CdsPos { + base: pos.base - cds_start_i, + offset: pos.offset, + cds_from: crate::parser::CdsFrom::Start, + } + } else { + CdsPos { + base: pos.base - cds_end_i, + offset: pos.offset, + cds_from: crate::parser::CdsFrom::End, + } + } + } + + /// Convert a transcript (n.) interval to a CDS (c.) interval. + pub fn n_to_c( + &self, + n_interval: &TxInterval, + strict_bounds: bool, + ) -> Result { + if self.cds_start_i.is_none() { + return Err(anyhow::anyhow!( + "CDS is undefined for {}; cannot map to c. coordinate (non-coding transcript?)", + self.tx_ac + )); + } + + if strict_bounds && (n_interval.start.base <= 0 || n_interval.end.base > self.tgt_len) { + return Err(anyhow::anyhow!( + "The given coordinate is outside the bounds of the reference sequence." + )); + } + + Ok(CdsInterval { + start: self.pos_n_to_c(&n_interval.start), + end: self.pos_n_to_c(&n_interval.end), + }) + } + + pub fn pos_c_to_n(&self, pos: &CdsPos, strict_bounds: bool) -> Result { + let cds_start_i = self.cds_start_i.unwrap(); + let cds_end_i = self.cds_end_i.unwrap(); + + let n = match pos.cds_from { + crate::parser::CdsFrom::Start => { + let n = pos.base + cds_start_i; + if pos.base < 0 { + // correct for lack of c.0 coordinate + n + 1 + } else { + n + } + } + crate::parser::CdsFrom::End => pos.base + cds_end_i, + }; + + let n = if n <= 0 { + // correct for lack of n.0 coordinate + n - 1 + } else { + n + }; + + if (n <= 0 || n > self.tgt_len) && strict_bounds { + Err(anyhow::anyhow!("c.{:?} coordinate is out of boounds", pos)) + } else { + Ok(TxPos { + base: n, + offset: pos.offset, + }) + } + } + + /// Convert a a CDS (c.) interval to a transcript (n.) interval. + pub fn c_to_n( + &self, + c_interval: &CdsInterval, + strict_bounds: bool, + ) -> Result { + if self.cds_start_i.is_none() { + return Err(anyhow::anyhow!( + "CDS is undefined for {}; cannot map to c. coordinate (non-coding transcript?)", + self.tx_ac + )); + } + + let n_start = self.pos_c_to_n(&c_interval.start, strict_bounds); + let n_end = self.pos_c_to_n(&c_interval.end, strict_bounds); + Ok(TxInterval { + start: n_start?, + end: n_end?, + }) + } + + /// Convert a genomic (g.) interval to a CDS (c.) interval. + pub fn g_to_c( + &self, + g_interval: &GenomeInterval, + strict_bounds: bool, + ) -> Result, anyhow::Error> { + let n_interval = self.g_to_n(g_interval, strict_bounds)?; + Ok(match &n_interval { + Mu::Certain(n_interval) => Mu::Certain(self.n_to_c(n_interval, strict_bounds)?), + Mu::Uncertain(n_interval) => Mu::Uncertain(self.n_to_c(n_interval, strict_bounds)?), + }) + } + + /// Convert a CDS (c.) interval to a genomic (g.) interval. + pub fn c_to_g( + &self, + c_interval: &CdsInterval, + strict_bounds: bool, + ) -> Result, anyhow::Error> { + let n_interval = self.c_to_n(c_interval, strict_bounds)?; + let g_interval = self.n_to_g(&n_interval, strict_bounds)?; + Ok(g_interval) + } + + /// Return the transcript is coding. + pub fn is_coding_transcript(&self) -> bool { + self.cds_start_i.is_some() + } + + /// Return whether the given genome interval is in bounds. + pub fn is_g_interval_in_bounds(&self, g_interval: GenomeInterval) -> bool { + let grs = g_interval.start.unwrap() - 1 - self.gc_offset; + let gre = g_interval.end.unwrap() - 1 - self.gc_offset; + grs >= 0 && gre <= self.cigar_mapper.ref_len + } +} + +#[cfg(test)] +mod test { + use std::{rc::Rc, str::FromStr}; + + use pretty_assertions::assert_eq; + + use crate::{ + data::{ + interface::{Provider as Interface, TxExonsRecord}, + uta::{Config, Provider}, + }, + parser::{CdsFrom, CdsInterval, CdsPos, GenomeInterval, Mu, TxInterval, TxPos}, + }; + + use super::{build_tx_cigar, none_if_default, AlignmentMapper}; + + #[test] + fn build_tx_cigar_empty() { + assert!(build_tx_cigar(&Vec::new(), 1).is_err()); + } + + #[test] + fn build_tx_cigar_forward() -> Result<(), anyhow::Error> { + let exons = vec![ + TxExonsRecord { + tx_start_i: 0, + tx_end_i: 10, + alt_start_i: 100, + alt_end_i: 110, + cigar: "5M1I4M".to_string(), + ..Default::default() + }, + TxExonsRecord { + tx_start_i: 10, + tx_end_i: 21, + alt_start_i: 120, + alt_end_i: 131, + cigar: "7M1I2M".to_string(), + ..Default::default() + }, + ]; + + assert_eq!(format!("{}", &build_tx_cigar(&exons, 1)?), "5MI4M10N7MI2M"); + + Ok(()) + } + + #[test] + fn build_tx_cigar_reverse() -> Result<(), anyhow::Error> { + let exons = vec![ + TxExonsRecord { + tx_start_i: 0, + tx_end_i: 10, + alt_start_i: 100, + alt_end_i: 110, + cigar: "5M1I4M".to_string(), + ..Default::default() + }, + TxExonsRecord { + tx_start_i: 10, + tx_end_i: 21, + alt_start_i: 120, + alt_end_i: 131, + cigar: "7M1I2M".to_string(), + ..Default::default() + }, + ]; + + assert_eq!(format!("{}", &build_tx_cigar(&exons, -1)?), "4MI5M10N2MI7M"); + + Ok(()) + } + + #[test] + fn run_none_if_default() { + assert_eq!(none_if_default(0u32), None); + assert_eq!(none_if_default(1u32), Some(1u32)); + assert_eq!(none_if_default(1i32), Some(1i32)); + assert_eq!(none_if_default(-1i32), Some(-1i32)); + } + + fn get_config() -> Config { + Config { + db_url: std::env::var("TEST_UTA_DATABASE_URL") + .expect("Environment variable TEST_UTA_DATABASE_URL undefined!"), + db_schema: std::env::var("TEST_UTA_DATABASE_SCHEMA") + .expect("Environment variable TEST_UTA_DATABASE_SCHEMA undefined!"), + } + } + + #[test] + fn construction() -> Result<(), anyhow::Error> { + let config = get_config(); + let provider = Provider::with_config(&config)?; + + assert_eq!(provider.data_version(), config.db_schema); + assert_eq!(provider.schema_version(), "1.1"); + + Ok(()) + } + + #[test] + fn failures() -> Result<(), anyhow::Error> { + let config = get_config(); + let provider = Rc::new(Provider::with_config(&config)?); + + // unknown sequences + + assert!(AlignmentMapper::new(provider.clone(), "bogus", "NM_033089.6", "splign").is_err()); + assert!( + AlignmentMapper::new(provider.clone(), "bogus", "NM_033089.6", "transcript").is_err() + ); + assert!(AlignmentMapper::new(provider.clone(), "NM_033089.6", "bogus", "splign").is_err()); + assert!( + AlignmentMapper::new(provider.clone(), "NM_000051.3", "NC_000011.9", "bogus").is_err() + ); + + // invalid intervals + + { + let am = + AlignmentMapper::new(provider.clone(), "NM_000348.3", "NC_000002.11", "splign")?; + assert!(am + .n_to_c( + &TxInterval { + start: TxPos { + base: -1, + offset: None + }, + end: TxPos { + base: -1, + offset: None + }, + }, + true + ) + .is_err()); + } + + { + let am = AlignmentMapper::new(provider, "NM_000348.3", "NC_000002.11", "splign")?; + assert!(am + .c_to_n( + &CdsInterval { + start: CdsPos { + base: 99999, + offset: None, + cds_from: CdsFrom::Start + }, + end: CdsPos { + base: 99999, + offset: None, + cds_from: CdsFrom::Start + }, + }, + true + ) + .is_err()); + } + + Ok(()) + } + + /// Helper for running multiple projection cases. + fn run_test_cases( + tx_ac: &str, + alt_ac: &str, + cases: &Vec<(GenomeInterval, TxInterval, CdsInterval)>, + ) -> Result<(), anyhow::Error> { + let config = get_config(); + let provider = Rc::new(Provider::with_config(&config)?); + let mapper = AlignmentMapper::new(provider, tx_ac, alt_ac, "splign")?; + + for (g_interval, n_interval, c_interval) in cases { + assert_eq!( + c_interval, + mapper.g_to_c(g_interval, true)?.inner(), + "{}~{} {} mapper.g_to_c", + tx_ac, + alt_ac, + &g_interval + ); + assert_eq!( + c_interval, + &mapper.n_to_c(n_interval, true)?, + "{}~{} {} mapper.n_to_c", + tx_ac, + alt_ac, + &n_interval + ); + + assert_eq!( + g_interval, + mapper.c_to_g(c_interval, true)?.inner(), + "{}~{} {} mapper.c_to_g", + tx_ac, + alt_ac, + &c_interval + ); + assert_eq!( + Mu::Certain(g_interval.clone()), + mapper.n_to_g(n_interval, true)?, + "{}~{} {} mapper.n_to_g", + tx_ac, + alt_ac, + &g_interval + ); + + assert_eq!( + n_interval, + &mapper.c_to_n(c_interval, true)?, + "{}~{} {} mapper.c_to_n", + tx_ac, + alt_ac, + &c_interval + ); + assert_eq!( + Mu::Certain(n_interval.clone()), + mapper.g_to_n(g_interval, true)?, + "{}~{} {} mapper.g_to_n", + tx_ac, + alt_ac, + &n_interval + ); + } + + Ok(()) + } + + /// Use NM_178434.2 tests to test mapping with uncertain positions + // #[test] // not yet supported (same as Python package) + fn _test_lce3c_uncertain() -> Result<(), anyhow::Error> { + let tx_ac = "NM_178434.2"; + let alt_ac = "NC_000001.10"; + let test_cases = vec![ + ( + GenomeInterval::from_str("?_152573139")?, + TxInterval::from_str("?_2")?, + CdsInterval::from_str("?_-69")?, + ), + ( + GenomeInterval::from_str("152573138_?")?, + TxInterval::from_str("1_?")?, + CdsInterval::from_str("-70_?")?, + ), + ]; + + run_test_cases(tx_ac, alt_ac, &test_cases)?; + + Ok(()) + } + + /// NM_178434.2: LCE3C single exon, strand = +1, all coordinate input/output are in HGVS + #[test] + fn test_lce3c() -> Result<(), anyhow::Error> { + let tx_ac = "NM_178434.2"; + let alt_ac = "NC_000001.10"; + let test_cases = vec![ + // 5' + ( + GenomeInterval::from_str("152573138")?, + TxInterval::from_str("1")?, + CdsInterval::from_str("-70")?, + ), + ( + GenomeInterval::from_str("152573140")?, + TxInterval::from_str("3")?, + CdsInterval::from_str("-68")?, + ), + // CDS + ( + GenomeInterval::from_str("152573207")?, + TxInterval::from_str("70")?, + CdsInterval::from_str("-1")?, + ), + ( + GenomeInterval::from_str("152573208")?, + TxInterval::from_str("71")?, + CdsInterval::from_str("1")?, + ), + // 3' + ( + GenomeInterval::from_str("152573492")?, + TxInterval::from_str("355")?, + CdsInterval::from_str("285")?, + ), + ( + GenomeInterval::from_str("152573493")?, + TxInterval::from_str("356")?, + CdsInterval::from_str("*1")?, + ), + ( + GenomeInterval::from_str("152573560")?, + TxInterval::from_str("423")?, + CdsInterval::from_str("*68")?, + ), + ( + GenomeInterval::from_str("152573562")?, + TxInterval::from_str("425")?, + CdsInterval::from_str("*70")?, + ), + ]; + + run_test_cases(tx_ac, alt_ac, &test_cases)?; + + Ok(()) + } + + /// NM_033445.2: H2AW single exon, strand = -1, all coordinate input/output are in HGVS + #[test] + fn test_h2a2() -> Result<(), anyhow::Error> { + let tx_ac = "NM_033445.2"; + let alt_ac = "NC_000001.10"; + let test_cases = vec![ + // 3' + ( + GenomeInterval::from_str("228645560")?, + TxInterval::from_str("1")?, + CdsInterval::from_str("-42")?, + ), + ( + GenomeInterval::from_str("228645558")?, + TxInterval::from_str("3")?, + CdsInterval::from_str("-40")?, + ), + // CDS + ( + GenomeInterval::from_str("228645519")?, + TxInterval::from_str("42")?, + CdsInterval::from_str("-1")?, + ), + ( + GenomeInterval::from_str("228645518")?, + TxInterval::from_str("43")?, + CdsInterval::from_str("1")?, + ), + // 5' + ( + GenomeInterval::from_str("228645126")?, + TxInterval::from_str("435")?, + CdsInterval::from_str("393")?, + ), + ( + GenomeInterval::from_str("228645125")?, + TxInterval::from_str("436")?, + CdsInterval::from_str("*1")?, + ), + ( + GenomeInterval::from_str("228645124")?, + TxInterval::from_str("437")?, + CdsInterval::from_str("*2")?, + ), + ( + GenomeInterval::from_str("228645065")?, + TxInterval::from_str("496")?, + CdsInterval::from_str("*61")?, + ), + ]; + + run_test_cases(tx_ac, alt_ac, &test_cases)?; + + Ok(()) + } + + /// NM_014357.4: LCE2B, two exons, strand = +1, all coordinate input/output are in HGVS + #[test] + fn test_lce2b() -> Result<(), anyhow::Error> { + let tx_ac = "NM_014357.4"; + let alt_ac = "NC_000001.10"; + let test_cases = vec![ + // 5' + ( + GenomeInterval::from_str("152658599")?, + TxInterval::from_str("1")?, + CdsInterval::from_str("-54")?, + ), + ( + GenomeInterval::from_str("152658601")?, + TxInterval::from_str("3")?, + CdsInterval::from_str("-52")?, + ), + // CDS + ( + GenomeInterval::from_str("152659319")?, + TxInterval::from_str("54")?, + CdsInterval::from_str("-1")?, + ), + ( + GenomeInterval::from_str("152659320")?, + TxInterval::from_str("55")?, + CdsInterval::from_str("1")?, + ), + // around end of exon 1 + ( + GenomeInterval::from_str("152658632")?, + TxInterval::from_str("34")?, + CdsInterval::from_str("-21")?, + ), + ( + GenomeInterval::from_str("152658633")?, + TxInterval::from_str("34+1")?, + CdsInterval::from_str("-21+1")?, + ), + // span + ( + GenomeInterval::from_str("152658633_152659299")?, + TxInterval::from_str("34+1_35-1")?, + CdsInterval::from_str("-21+1_-20-1")?, + ), + // around start of exon 2 + ( + GenomeInterval::from_str("152659300")?, + TxInterval::from_str("35")?, + CdsInterval::from_str("-20")?, + ), + ( + GenomeInterval::from_str("152659299")?, + TxInterval::from_str("35-1")?, + CdsInterval::from_str("-20-1")?, + ), + // around end of exon 2 + ( + GenomeInterval::from_str("152659652")?, + TxInterval::from_str("387")?, + CdsInterval::from_str("333")?, + ), + ( + GenomeInterval::from_str("152659653")?, + TxInterval::from_str("388")?, + CdsInterval::from_str("*1")?, + ), + // span + ( + GenomeInterval::from_str("152659651_152659654")?, + TxInterval::from_str("386_389")?, + CdsInterval::from_str("332_*2")?, + ), + // 3' + ( + GenomeInterval::from_str("152659877")?, + TxInterval::from_str("612")?, + CdsInterval::from_str("*225")?, + ), + ]; + + run_test_cases(tx_ac, alt_ac, &test_cases)?; + + Ok(()) + } + + /// NM_178449.3: PTH2, two exons, strand = -1, all coordinate input/output are in HGVS + #[test] + fn test_pth2() -> Result<(), anyhow::Error> { + let tx_ac = "NM_178449.3"; + let alt_ac = "NC_000019.9"; + let test_cases = vec![ + // 3' + ( + GenomeInterval::from_str("49926698")?, + TxInterval::from_str("1")?, + CdsInterval::from_str("-102")?, + ), + // CDS + ( + GenomeInterval::from_str("49926597")?, + TxInterval::from_str("102")?, + CdsInterval::from_str("-1")?, + ), + ( + GenomeInterval::from_str("49926596")?, + TxInterval::from_str("103")?, + CdsInterval::from_str("1")?, + ), + // around end of exon 1 + ( + GenomeInterval::from_str("49926469")?, + TxInterval::from_str("230")?, + CdsInterval::from_str("128")?, + ), + ( + GenomeInterval::from_str("49926468")?, + TxInterval::from_str("230+1")?, + CdsInterval::from_str("128+1")?, + ), + // span + ( + GenomeInterval::from_str("49925901_49926467")?, + TxInterval::from_str("230+2_231-2")?, + CdsInterval::from_str("128+2_129-2")?, + ), + // around start of exons 2 + ( + GenomeInterval::from_str("49925900")?, + TxInterval::from_str("231-1")?, + CdsInterval::from_str("129-1")?, + ), + ( + GenomeInterval::from_str("49925899")?, + TxInterval::from_str("231")?, + CdsInterval::from_str("129")?, + ), + // around end of exon2 + ( + GenomeInterval::from_str("49925725")?, + TxInterval::from_str("405")?, + CdsInterval::from_str("303")?, + ), + ( + GenomeInterval::from_str("49925724")?, + TxInterval::from_str("406")?, + CdsInterval::from_str("*1")?, + ), + ( + GenomeInterval::from_str("49925671")?, + TxInterval::from_str("459")?, + CdsInterval::from_str("*54")?, + ), + ]; + + run_test_cases(tx_ac, alt_ac, &test_cases)?; + + Ok(()) + } +} diff --git a/src/mapper/cigar.rs b/src/mapper/cigar.rs new file mode 100644 index 0000000..91b0534 --- /dev/null +++ b/src/mapper/cigar.rs @@ -0,0 +1,565 @@ +//! Code supporting the `CigarMapper` + +use std::fmt::Display; + +use nom::{combinator::all_consuming, multi::many0}; + +/// CIGAR operation as parsed from UTA. +#[derive(Debug, PartialEq, Eq, PartialOrd, Ord, Clone, Copy)] +pub enum CigarOp { + /// = + Eq, + /// D + Del, + /// I + Ins, + /// M + Match, + /// S + Skip, + /// X + Mismatch, +} + +impl CigarOp { + pub fn is_advance_ref(&self) -> bool { + matches!( + self, + CigarOp::Eq | CigarOp::Match | CigarOp::Mismatch | CigarOp::Ins | CigarOp::Skip + ) + } + + pub fn is_advance_tgt(&self) -> bool { + matches!( + self, + CigarOp::Eq | CigarOp::Match | CigarOp::Mismatch | CigarOp::Del + ) + } +} + +impl TryFrom for CigarOp { + type Error = anyhow::Error; + + fn try_from(value: char) -> Result { + Ok(match value { + '=' => Self::Eq, + 'D' => Self::Del, + 'I' => Self::Ins, + 'M' => Self::Match, + 'N' => Self::Skip, + 'X' => Self::Mismatch, + _ => return Err(anyhow::anyhow!("Invalid CIGAR character {}", value)), + }) + } +} + +impl From for char { + fn from(val: CigarOp) -> Self { + match val { + CigarOp::Eq => '=', + CigarOp::Del => 'D', + CigarOp::Ins => 'I', + CigarOp::Match => 'M', + CigarOp::Skip => 'N', + CigarOp::Mismatch => 'X', + } + } +} + +impl Display for CigarOp { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "{}", std::convert::Into::::into(*self)) + } +} + +/// CIGAR element consisting of count and CIGAR operation. +#[derive(Debug, PartialEq, Eq, PartialOrd, Ord, Clone, Copy)] +pub struct CigarElement { + pub count: i32, + pub op: CigarOp, +} + +impl Display for CigarElement { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + if self.count > 1 { + write!(f, "{}", self.count)?; + } + write!(f, "{}", self.op) + } +} + +impl CigarElement { + fn from_strs(count: &str, op: &str) -> CigarElement { + CigarElement { + count: if count.is_empty() { + 1 + } else { + str::parse(count).unwrap() + }, + op: op.chars().next().unwrap().try_into().unwrap(), + } + } +} + +#[derive(Debug, PartialEq, PartialOrd, Default, Clone)] +pub struct CigarString { + pub elems: Vec, +} + +impl CigarString { + fn from(elems: Vec) -> Self { + Self { elems } + } +} + +impl std::ops::Deref for CigarString { + type Target = Vec; + fn deref(&self) -> &Self::Target { + &self.elems + } +} +impl std::ops::DerefMut for CigarString { + fn deref_mut(&mut self) -> &mut Self::Target { + &mut self.elems + } +} + +impl Display for CigarString { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + for item in &self.elems { + write!(f, "{}", &item)? + } + Ok(()) + } +} + +pub mod parse { + use nom::{ + bytes::complete::take_while_m_n, + character::complete::digit0, + error::{context, VerboseError}, + sequence::pair, + IResult, + }; + + type Res = IResult>; + + use super::CigarElement; + + pub fn is_cigar_op_char(c: char) -> bool { + "=DIMNX".contains(c) + } + + pub fn cigar_element(input: &str) -> Res<&str, CigarElement> { + context( + "cigar_element", + pair(digit0, take_while_m_n(1, 1, is_cigar_op_char)), + )(input) + .map(|(rest, (count, op))| (rest, CigarElement::from_strs(count, op))) + } +} + +/// Parse a CIGAR `str` into a real one. +pub fn parse_cigar_string(input: &str) -> Result { + Ok(CigarString::from( + all_consuming(many0(parse::cigar_element))(input) + .map_err(|e| anyhow::anyhow!("Problem with parsing: {:?}", e))? + .1, + )) +} + +/// Provide coordinate mapping between two sequences whose alignment is given by a CIGAR string. +/// +/// CIGAR is about alignments between positions in two sequences. It is base-centric. +/// +/// Unfortunately, base-centric coordinate systems require additional complexity to refer to +/// zero-width positions. +/// +/// This code uses interbase intervals. Interbase positions are zero-width boundaries between +/// bases. They often look similar to zero-based, right open coordinates. (But don't call them +/// that. It upsets me deeply.) The most important difference is that zero width intervals +/// neatly represent insertions between bases (or before or after the sequence). +#[derive(Default, Debug)] +pub struct CigarMapper { + pub cigar_string: CigarString, + pub ref_pos: Vec, + pub tgt_pos: Vec, + pub cigar_op: Vec, + pub ref_len: i32, + pub tgt_len: i32, +} + +#[derive(Debug, PartialEq)] +pub struct CigarMapperResult { + pub pos: i32, + pub offset: i32, + pub cigar_op: CigarOp, +} + +impl CigarMapper { + pub fn new(cigar_string: &CigarString) -> Self { + let (ref_pos, tgt_pos, cigar_op) = Self::init(cigar_string); + + Self { + cigar_string: cigar_string.clone(), + ref_len: *ref_pos.last().unwrap(), + tgt_len: *tgt_pos.last().unwrap(), + ref_pos, + tgt_pos, + cigar_op, + } + } + + /// For a given CIGAR string, return the start positions of each aligned segment in ref + /// and tgt, and a list of CIGAR operators. + fn init(cigar_string: &CigarString) -> (Vec, Vec, Vec) { + let cigar_len = cigar_string.len(); + + let mut ref_pos = vec![-1; cigar_len]; + let mut tgt_pos = vec![-1; cigar_len]; + let mut cigar_op = vec![CigarOp::Mismatch; cigar_len]; + let mut ref_cur = 0; + let mut tgt_cur = 0; + for (i, CigarElement { count, op }) in cigar_string.iter().enumerate() { + ref_pos[i] = ref_cur; + tgt_pos[i] = tgt_cur; + cigar_op[i] = *op; + if op.is_advance_ref() { + ref_cur += *count; + } + if op.is_advance_tgt() { + tgt_cur += *count; + } + } + ref_pos.push(ref_cur); + tgt_pos.push(tgt_cur); + + (ref_pos, tgt_pos, cigar_op) + } + + pub fn map_ref_to_tgt( + &self, + pos: i32, + end: &str, + strict_bounds: bool, + ) -> Result { + self.map(&self.ref_pos, &self.tgt_pos, pos, end, strict_bounds) + } + + pub fn map_tgt_to_ref( + &self, + pos: i32, + end: &str, + strict_bounds: bool, + ) -> Result { + self.map(&self.tgt_pos, &self.ref_pos, pos, end, strict_bounds) + } + + /// Map position between aligned segments. + /// + /// Positions in this function are 0-based, base-counting. + fn map( + &self, + from_pos: &[i32], + to_pos: &[i32], + pos: i32, + end: &str, + strict_bounds: bool, + ) -> Result { + if strict_bounds && (pos < 0 || pos > *from_pos.last().unwrap()) { + return Err(anyhow::anyhow!( + "Position is beyond the bounds of transcript record (pos={}, from_pos={:?}, to_pos={:?}]", + pos, + from_pos, + to_pos, + )); + } + + // Find aligned segment to use as basis for mapping. It is okay for pos to be + // before first element or after last. + let pos_i = { + let mut pos_i = 0; + while pos_i < self.cigar_op.len() { + if pos < from_pos[pos_i + 1] { + break; + } + pos_i += 1; + } + std::cmp::min(pos_i, self.cigar_op.len().saturating_sub(1)) + }; + + let cigar_op = self.cigar_op[pos_i]; + + if cigar_op == CigarOp::Eq || cigar_op == CigarOp::Match || cigar_op == CigarOp::Mismatch { + Ok(CigarMapperResult { + pos: to_pos[pos_i] + (pos - from_pos[pos_i]), + offset: 0, + cigar_op, + }) + } else if cigar_op == CigarOp::Del || cigar_op == CigarOp::Ins { + Ok(CigarMapperResult { + pos: if end == "start" { + to_pos[pos_i] - 1 + } else { + to_pos[pos_i] + }, + offset: 0, + cigar_op, + }) + } else if cigar_op == CigarOp::Skip { + if pos - from_pos[pos_i] < from_pos[pos_i + 1] - pos { + Ok(CigarMapperResult { + pos: to_pos[pos_i] - 1, + offset: pos - from_pos[pos_i] + 1, + cigar_op, + }) + } else { + Ok(CigarMapperResult { + pos: to_pos[pos_i], + offset: -(from_pos[pos_i + 1] - pos), + cigar_op, + }) + } + } else { + Err(anyhow::anyhow!("Algorithm error in CIGAR mapper")) + } + } +} + +#[cfg(test)] +mod test { + use pretty_assertions::assert_eq; + + use super::{parse_cigar_string, CigarElement, CigarMapper, CigarMapperResult, CigarOp}; + + #[test] + fn parse_cigar_string_simple() -> Result<(), anyhow::Error> { + // assert_eq!(parse_cigar_string("")?, vec![]); + assert_eq!( + parse_cigar_string("M")?.elems, + vec![CigarElement { + count: 1, + op: CigarOp::Match + }] + ); + assert_eq!( + parse_cigar_string("MM")?.elems, + vec![ + CigarElement { + count: 1, + op: CigarOp::Match + }, + CigarElement { + count: 1, + op: CigarOp::Match + } + ] + ); + assert_eq!( + parse_cigar_string("1M")?.elems, + vec![CigarElement { + count: 1, + op: CigarOp::Match, + },] + ); + assert_eq!( + parse_cigar_string("1M2I3X")?.elems, + vec![ + CigarElement { + count: 1, + op: CigarOp::Match, + }, + CigarElement { + count: 2, + op: CigarOp::Ins, + }, + CigarElement { + count: 3, + op: CigarOp::Mismatch, + }, + ] + ); + assert_eq!( + parse_cigar_string("1MI3X")?.elems, + vec![ + CigarElement { + count: 1, + op: CigarOp::Match, + }, + CigarElement { + count: 1, + op: CigarOp::Ins, + }, + CigarElement { + count: 3, + op: CigarOp::Mismatch, + }, + ] + ); + + Ok(()) + } + + #[test] + fn cigar_mapper_simple() -> Result<(), anyhow::Error> { + // 0 1 2 3 4 5 6 7 8 9 tgt + // = = = N N = X = N N N = I = D = + // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 ref + let cigar = "3=2N=X=3N=I=D=".to_string(); + let cigar_str = parse_cigar_string(&cigar)?; + let cigar_mapper = CigarMapper::new(&cigar_str); + + assert_eq!(cigar_mapper.ref_len, 15); + assert_eq!(cigar_mapper.tgt_len, 10); + assert_eq!(cigar_mapper.ref_pos.len(), cigar_mapper.tgt_pos.len()); + assert_eq!( + cigar_mapper.ref_pos, + vec![0, 3, 5, 6, 7, 8, 11, 12, 13, 14, 14, 15] + ); + assert_eq!( + cigar_mapper.tgt_pos, + vec![0, 3, 3, 4, 5, 6, 6, 7, 7, 8, 9, 10] + ); + + // ref to tgt + { + let cases = vec![ + (0, "start", 0, 0, CigarOp::Eq), + (0, "end", 0, 0, CigarOp::Eq), + (1, "start", 1, 0, CigarOp::Eq), + (1, "end", 1, 0, CigarOp::Eq), + (2, "start", 2, 0, CigarOp::Eq), + (2, "end", 2, 0, CigarOp::Eq), + (3, "start", 2, 1, CigarOp::Skip), + (3, "end", 2, 1, CigarOp::Skip), + (4, "start", 3, -1, CigarOp::Skip), + (4, "end", 3, -1, CigarOp::Skip), + (5, "start", 3, 0, CigarOp::Eq), + (5, "end", 3, 0, CigarOp::Eq), + (6, "start", 4, 0, CigarOp::Mismatch), + (6, "end", 4, 0, CigarOp::Mismatch), + (7, "start", 5, 0, CigarOp::Eq), + (7, "end", 5, 0, CigarOp::Eq), + (8, "start", 5, 1, CigarOp::Skip), + (8, "end", 5, 1, CigarOp::Skip), + (9, "start", 5, 2, CigarOp::Skip), + (9, "end", 5, 2, CigarOp::Skip), + (10, "start", 6, -1, CigarOp::Skip), + (10, "end", 6, -1, CigarOp::Skip), + (11, "start", 6, 0, CigarOp::Eq), + (11, "end", 6, 0, CigarOp::Eq), + (12, "start", 6, 0, CigarOp::Ins), + (12, "end", 7, 0, CigarOp::Ins), + (13, "start", 7, 0, CigarOp::Eq), + (13, "end", 7, 0, CigarOp::Eq), + (14, "start", 9, 0, CigarOp::Eq), + (14, "end", 9, 0, CigarOp::Eq), + ]; + for (arg_pos, arg_end, pos, offset, cigar_op) in cases { + assert_eq!( + cigar_mapper.map_ref_to_tgt(arg_pos, arg_end, true)?, + CigarMapperResult { + pos, + offset, + cigar_op + }, + "case = {:?}", + (arg_pos, arg_end, pos, offset, cigar_op) + ); + } + } + + // tgt to ref + { + let cases = vec![ + (0, "start", 0, 0, CigarOp::Eq), + (0, "end", 0, 0, CigarOp::Eq), + (1, "start", 1, 0, CigarOp::Eq), + (1, "end", 1, 0, CigarOp::Eq), + (2, "start", 2, 0, CigarOp::Eq), + (2, "end", 2, 0, CigarOp::Eq), + (3, "start", 5, 0, CigarOp::Eq), + (3, "end", 5, 0, CigarOp::Eq), + (4, "start", 6, 0, CigarOp::Mismatch), + (4, "end", 6, 0, CigarOp::Mismatch), + (5, "start", 7, 0, CigarOp::Eq), + (5, "end", 7, 0, CigarOp::Eq), + (6, "start", 11, 0, CigarOp::Eq), + (6, "end", 11, 0, CigarOp::Eq), + (7, "start", 13, 0, CigarOp::Eq), + (7, "end", 13, 0, CigarOp::Eq), + (8, "start", 13, 0, CigarOp::Del), + (8, "end", 14, 0, CigarOp::Del), + (9, "start", 14, 0, CigarOp::Eq), + (9, "end", 14, 0, CigarOp::Eq), + ]; + for (arg_pos, arg_end, pos, offset, cigar_op) in cases { + assert_eq!( + cigar_mapper.map_tgt_to_ref(arg_pos, arg_end, true)?, + CigarMapperResult { + pos, + offset, + cigar_op + }, + "case = {:?}", + (arg_pos, arg_end, pos, offset, cigar_op) + ); + } + } + + Ok(()) + } + + #[test] + fn cigar_mapper_strict_bounds() -> Result<(), anyhow::Error> { + // 0 1 2 3 4 5 6 7 8 9 tgt + // = = = N N = X = N N N = I = D = + // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 ref + let cigar = "3=2N=X=3N=I=D=".to_string(); + let cigar_str = parse_cigar_string(&cigar)?; + let cigar_mapper = CigarMapper::new(&cigar_str); + + // error for out of bounds on left? + assert!(cigar_mapper.map_ref_to_tgt(-1, "start", true).is_err()); + // ... and right? + assert!(cigar_mapper + .map_ref_to_tgt(cigar_mapper.ref_len + 1, "start", true) + .is_err()); + + // test whether 1 base outside bounds results in correct position + assert_eq!( + cigar_mapper.map_ref_to_tgt(0, "start", true)?, + CigarMapperResult { + pos: 0, + offset: 0, + cigar_op: CigarOp::Eq, + } + ); + assert_eq!( + cigar_mapper.map_ref_to_tgt(-1, "start", false)?, + CigarMapperResult { + pos: -1, + offset: 0, + cigar_op: CigarOp::Eq, + } + ); + assert_eq!( + cigar_mapper.map_ref_to_tgt(cigar_mapper.ref_len, "start", true)?, + CigarMapperResult { + pos: cigar_mapper.tgt_len, + offset: 0, + cigar_op: CigarOp::Eq, + } + ); + assert_eq!( + cigar_mapper.map_ref_to_tgt(cigar_mapper.ref_len - 1, "start", false)?, + CigarMapperResult { + pos: cigar_mapper.tgt_len - 1, + offset: 0, + cigar_op: CigarOp::Eq, + } + ); + + Ok(()) + } +} diff --git a/src/mapper/mod.rs b/src/mapper/mod.rs new file mode 100644 index 0000000..87eca1b --- /dev/null +++ b/src/mapper/mod.rs @@ -0,0 +1,3 @@ +pub mod alignment; +pub mod cigar; +pub mod variant; diff --git a/src/mapper/variant.rs b/src/mapper/variant.rs new file mode 100644 index 0000000..cb1bc35 --- /dev/null +++ b/src/mapper/variant.rs @@ -0,0 +1,85 @@ +//! Code for variant mapping. + +use std::rc::Rc; + +use crate::{ + data::interface::Provider, + parser::HgvsVariant, + validator::{ValidationLevel, Validator}, +}; + +/// Configuration for Mapper. +/// +/// Defaults are taken from `hgvs` Python library. +#[derive(Debug, PartialEq, Clone)] +pub struct Config { + pub replace_reference: bool, + pub strict_validation: bool, + pub prevalidation_level: ValidationLevel, + pub add_gene_symbol: bool, +} + +impl Default for Config { + fn default() -> Self { + Self { + replace_reference: true, + strict_validation: false, + prevalidation_level: ValidationLevel::Extrinsic, + add_gene_symbol: false, + } + } +} + +pub struct Mapper { + config: Config, + provider: Rc, + validator: Box, +} + +impl Mapper { + fn new(config: &Config, provider: Rc) -> Mapper { + let validator = config + .prevalidation_level + .validator(config.strict_validation, provider.clone()); + Mapper { + config: config.clone(), + provider, + validator, + } + } + + fn config(&self) -> &Config { + &self.config + } + + /// Convert from genome to transcription. + /// + /// # Args + /// + /// * `var_g` -- `HgvsVariant::GenomeVariant` to project + /// * `tx_ac` -- accession of transcript to project to + /// * `alt_al_method` -- alignment method, e.g., `splign` + fn g_to_t( + &self, + var_g: HgvsVariant, + _tx_ac: &str, + _alt_aln_method: &str, + ) -> Result<(), anyhow::Error> { + if let HgvsVariant::GenomeVariant { + accession: _, + gene_symbol: _, + loc_edit: _, + } = &var_g + { + self.validator.validate(&var_g)?; + let _var_g = var_g.fill_ref(self.provider.as_ref())?; + + Ok(()) + } else { + Err(anyhow::anyhow!( + "Expected a GenomeVariant but received {}", + &var_g + )) + } + } +} diff --git a/src/parser/display.rs b/src/parser/display.rs index a0bff34..492f35a 100644 --- a/src/parser/display.rs +++ b/src/parser/display.rs @@ -144,8 +144,8 @@ impl Display for ProtPos { impl Display for ProtInterval { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - write!(f, "{}", self.begin)?; - if self.begin != self.end { + write!(f, "{}", self.start)?; + if self.start != self.end { write!(f, "_{}", self.end)?; } Ok(()) @@ -172,8 +172,8 @@ impl Display for CdsLocEdit { impl Display for CdsInterval { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - write!(f, "{}", self.begin)?; - if self.begin != self.end { + write!(f, "{}", self.start)?; + if self.start != self.end { write!(f, "_{}", self.end)?; } Ok(()) @@ -207,8 +207,8 @@ impl Display for TxLocEdit { impl Display for TxInterval { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - write!(f, "{}", self.begin)?; - if self.begin != self.end { + write!(f, "{}", self.start)?; + if self.start != self.end { write!(f, "_{}", self.end)?; } Ok(()) @@ -238,8 +238,8 @@ impl Display for RnaLocEdit { impl Display for RnaInterval { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - write!(f, "{}", self.begin)?; - if self.begin != self.end { + write!(f, "{}", self.start)?; + if self.start != self.end { write!(f, "_{}", self.end)?; } Ok(()) @@ -269,11 +269,11 @@ impl Display for GenomeLocEdit { impl Display for GenomeInterval { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - match self.begin { + match self.start { Some(begin) => write!(f, "{begin}")?, None => write!(f, "?")?, } - if self.begin != self.end { + if self.start != self.end { match self.end { Some(end) => write!(f, "_{end}")?, None => write!(f, "_?")?, @@ -291,11 +291,11 @@ impl Display for MtLocEdit { impl Display for MtInterval { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - match self.begin { + match self.start { Some(begin) => write!(f, "{begin}")?, None => write!(f, "?")?, } - if self.begin != self.end { + if self.start != self.end { match self.end { Some(end) => write!(f, "_{end}")?, None => write!(f, "_?")?, @@ -1025,7 +1025,7 @@ mod test { format!( "{}", CdsInterval { - begin: CdsPos { + start: CdsPos { base: 42, offset: Some(-10), cds_from: CdsFrom::Start, @@ -1044,7 +1044,7 @@ mod test { format!( "{}", CdsInterval { - begin: CdsPos { + start: CdsPos { base: 42, offset: Some(10), cds_from: CdsFrom::Start, @@ -1067,7 +1067,7 @@ mod test { "{}", CdsLocEdit { loc: Mu::Certain(CdsInterval { - begin: CdsPos { + start: CdsPos { base: 42, offset: Some(-10), cds_from: CdsFrom::Start, @@ -1130,7 +1130,7 @@ mod test { format!( "{}", TxInterval { - begin: TxPos { + start: TxPos { base: 42, offset: Some(-10), }, @@ -1147,7 +1147,7 @@ mod test { format!( "{}", TxInterval { - begin: TxPos { + start: TxPos { base: 42, offset: Some(10), }, @@ -1168,7 +1168,7 @@ mod test { "{}", TxLocEdit { loc: Mu::Certain(TxInterval { - begin: TxPos { + start: TxPos { base: 42, offset: Some(-10), }, @@ -1228,7 +1228,7 @@ mod test { format!( "{}", RnaInterval { - begin: RnaPos { + start: RnaPos { base: 42, offset: Some(-10), }, @@ -1245,7 +1245,7 @@ mod test { format!( "{}", RnaInterval { - begin: RnaPos { + start: RnaPos { base: 42, offset: Some(10), }, @@ -1266,7 +1266,7 @@ mod test { "{}", RnaLocEdit { loc: Mu::Certain(RnaInterval { - begin: RnaPos { + start: RnaPos { base: 42, offset: Some(-10), }, @@ -1291,7 +1291,7 @@ mod test { format!( "{}", GenomeInterval { - begin: None, + start: None, end: None } ), @@ -1302,7 +1302,7 @@ mod test { format!( "{}", GenomeInterval { - begin: Some(10), + start: Some(10), end: None } ), @@ -1313,7 +1313,7 @@ mod test { format!( "{}", GenomeInterval { - begin: None, + start: None, end: Some(10) } ), @@ -1324,7 +1324,7 @@ mod test { format!( "{}", GenomeInterval { - begin: Some(10), + start: Some(10), end: Some(20) } ), @@ -1335,7 +1335,7 @@ mod test { format!( "{}", GenomeInterval { - begin: Some(10), + start: Some(10), end: Some(10) } ), @@ -1350,7 +1350,7 @@ mod test { "{}", GenomeLocEdit { loc: Mu::Certain(GenomeInterval { - begin: Some(10), + start: Some(10), end: Some(20) }), edit: Mu::Certain(NaEdit::RefAlt { @@ -1369,7 +1369,7 @@ mod test { format!( "{}", MtInterval { - begin: None, + start: None, end: None } ), @@ -1380,7 +1380,7 @@ mod test { format!( "{}", MtInterval { - begin: Some(10), + start: Some(10), end: None } ), @@ -1391,7 +1391,7 @@ mod test { format!( "{}", MtInterval { - begin: None, + start: None, end: Some(10) } ), @@ -1402,7 +1402,7 @@ mod test { format!( "{}", MtInterval { - begin: Some(10), + start: Some(10), end: Some(20) } ), @@ -1413,7 +1413,7 @@ mod test { format!( "{}", MtInterval { - begin: Some(10), + start: Some(10), end: Some(10) } ), @@ -1428,7 +1428,7 @@ mod test { "{}", MtLocEdit { loc: Mu::Certain(MtInterval { - begin: Some(10), + start: Some(10), end: Some(20) }), edit: Mu::Certain(NaEdit::RefAlt { @@ -1461,7 +1461,7 @@ mod test { format!( "{}", ProtInterval { - begin: ProtPos { + start: ProtPos { aa: "Leu".to_string(), number: 42 }, @@ -1478,7 +1478,7 @@ mod test { format!( "{}", ProtInterval { - begin: ProtPos { + start: ProtPos { aa: "Leu".to_string(), number: 42 }, @@ -1499,7 +1499,7 @@ mod test { "{}", ProtLocEdit::Ordinary { loc: Mu::Certain(ProtInterval { - begin: ProtPos { + start: ProtPos { aa: "Leu".to_string(), number: 42 }, @@ -1543,7 +1543,7 @@ mod test { }), loc_edit: CdsLocEdit { loc: Mu::Certain(CdsInterval { - begin: CdsPos { + start: CdsPos { base: 100, offset: None, cds_from: CdsFrom::Start, @@ -1574,7 +1574,7 @@ mod test { gene_symbol: None, loc_edit: CdsLocEdit { loc: Mu::Certain(CdsInterval { - begin: CdsPos { + start: CdsPos { base: 100, offset: None, cds_from: CdsFrom::Start, @@ -1610,7 +1610,7 @@ mod test { }), loc_edit: GenomeLocEdit { loc: Mu::Certain(GenomeInterval { - begin: Some(100), + start: Some(100), end: Some(100) }), edit: Mu::Certain(NaEdit::RefAlt { @@ -1633,7 +1633,7 @@ mod test { gene_symbol: None, loc_edit: GenomeLocEdit { loc: Mu::Certain(GenomeInterval { - begin: Some(100), + start: Some(100), end: Some(100) }), edit: Mu::Certain(NaEdit::RefAlt { @@ -1661,7 +1661,7 @@ mod test { }), loc_edit: MtLocEdit { loc: Mu::Certain(MtInterval { - begin: Some(100), + start: Some(100), end: Some(100) }), edit: Mu::Certain(NaEdit::RefAlt { @@ -1684,7 +1684,7 @@ mod test { gene_symbol: None, loc_edit: MtLocEdit { loc: Mu::Certain(MtInterval { - begin: Some(100), + start: Some(100), end: Some(100) }), edit: Mu::Certain(NaEdit::RefAlt { @@ -1712,7 +1712,7 @@ mod test { }), loc_edit: TxLocEdit { loc: Mu::Certain(TxInterval { - begin: TxPos { + start: TxPos { base: 100, offset: None }, @@ -1741,7 +1741,7 @@ mod test { gene_symbol: None, loc_edit: TxLocEdit { loc: Mu::Certain(TxInterval { - begin: TxPos { + start: TxPos { base: 100, offset: None }, @@ -1775,7 +1775,7 @@ mod test { }), loc_edit: RnaLocEdit { loc: Mu::Certain(RnaInterval { - begin: RnaPos { + start: RnaPos { base: 100, offset: None }, @@ -1804,7 +1804,7 @@ mod test { gene_symbol: None, loc_edit: RnaLocEdit { loc: Mu::Certain(RnaInterval { - begin: RnaPos { + start: RnaPos { base: 100, offset: None }, diff --git a/src/parser/ds.rs b/src/parser/ds.rs index c131b0a..ba6a0df 100644 --- a/src/parser/ds.rs +++ b/src/parser/ds.rs @@ -9,6 +9,30 @@ pub enum Mu { Uncertain(T), } +impl Mu { + pub fn from(value: T, is_certain: bool) -> Self { + if is_certain { + Mu::Certain(value) + } else { + Mu::Uncertain(value) + } + } + + pub fn unwrap(self) -> T { + match self { + Mu::Certain(value) => value, + Mu::Uncertain(value) => value, + } + } + + pub fn inner(&self) -> &T { + match self { + Mu::Certain(value) => value, + Mu::Uncertain(value) => value, + } + } +} + /// Representation of gene symbol, e.g., `TTN` or `Ttn`. #[derive(Clone, Debug, PartialEq)] pub struct GeneSymbol { @@ -143,7 +167,7 @@ pub struct CdsLocEdit { #[derive(Clone, Debug, PartialEq)] pub struct CdsInterval { /// Start position - pub begin: CdsPos, + pub start: CdsPos, /// End position pub end: CdsPos, } @@ -180,7 +204,7 @@ pub struct GenomeLocEdit { #[derive(Clone, Debug, PartialEq)] pub struct GenomeInterval { /// Start position - pub begin: Option, + pub start: Option, /// End position pub end: Option, } @@ -198,7 +222,7 @@ pub struct MtLocEdit { #[derive(Clone, Debug, PartialEq)] pub struct MtInterval { /// Start position - pub begin: Option, + pub start: Option, /// End position pub end: Option, } @@ -216,7 +240,7 @@ pub struct TxLocEdit { #[derive(Clone, Debug, PartialEq)] pub struct TxInterval { /// Start position - pub begin: TxPos, + pub start: TxPos, /// End position pub end: TxPos, } @@ -243,7 +267,7 @@ pub struct RnaLocEdit { #[derive(Clone, Debug, PartialEq)] pub struct RnaInterval { /// Start position - pub begin: RnaPos, + pub start: RnaPos, /// End position pub end: RnaPos, } @@ -278,7 +302,7 @@ pub enum ProtLocEdit { #[derive(Clone, Debug, PartialEq)] pub struct ProtInterval { /// Start position - pub begin: ProtPos, + pub start: ProtPos, /// End position pub end: ProtPos, } @@ -291,3 +315,110 @@ pub struct ProtPos { /// Number of `aa`. pub number: i32, } + +#[cfg(test)] +mod test { + use pretty_assertions::assert_eq; + + use super::{TxInterval, TxPos}; + use crate::parser::Mu; + + #[test] + fn mu_construct() { + assert_eq!(format!("{:?}", Mu::Certain(1)), "Certain(1)"); + assert_eq!(format!("{:?}", Mu::Certain(Some(1))), "Certain(Some(1))"); + assert_eq!(format!("{:?}", Mu::Uncertain(1)), "Uncertain(1)"); + assert_eq!( + format!("{:?}", Mu::Uncertain(Some(1))), + "Uncertain(Some(1))" + ); + } + + #[test] + fn mu_unwrap() { + assert_eq!(Mu::Certain(1).unwrap(), 1); + assert_eq!(Mu::Uncertain(1).unwrap(), 1); + } + + #[test] + fn mu_inner() { + assert_eq!(Mu::Certain(1).inner(), &1); + assert_eq!(Mu::Uncertain(1).inner(), &1); + } + + #[test] + fn mu_from() { + assert_eq!(Mu::from(1, true), Mu::Certain(1)); + assert_eq!(Mu::from(1, false), Mu::Uncertain(1)); + assert_eq!(Mu::from(Some(1), true), Mu::Certain(Some(1))); + assert_eq!(Mu::from(Some(1), false), Mu::Uncertain(Some(1))); + } + + #[derive(Clone, Debug, PartialEq)] + pub struct TestInterval { + pub start: TestPos, + pub end: TestPos, + } + + #[derive(Clone, Debug, PartialEq)] + pub struct TestPos { + pub base: i32, + pub offset: Option, + } + + #[test] + fn mu_from_problematic() { + let test_itv = TestInterval { + start: TestPos { + base: 34, + offset: Some(1), + }, + end: TestPos { + base: 35, + offset: Some(-1), + }, + }; + + assert_eq!( + Mu::from(test_itv, true), + Mu::Certain(TestInterval { + start: TestPos { + base: 34, + offset: Some(1), + }, + end: TestPos { + base: 35, + offset: Some(-1), + } + }) + ); + } + + #[test] + fn mu_from_tx_interval() { + let test_itv = TxInterval { + start: TxPos { + base: 34, + offset: Some(1), + }, + end: TxPos { + base: 35, + offset: Some(-1), + }, + }; + + assert_eq!( + Mu::from(test_itv, true), + Mu::Certain(TxInterval { + start: TxPos { + base: 34, + offset: Some(1), + }, + end: TxPos { + base: 35, + offset: Some(-1), + } + }) + ); + } +} diff --git a/src/parser/impl_ops.rs b/src/parser/impl_ops.rs new file mode 100644 index 0000000..9e08450 --- /dev/null +++ b/src/parser/impl_ops.rs @@ -0,0 +1,16 @@ +//! Implementation of operations on the data structures. + +use crate::data::interface::Provider; + +use super::HgvsVariant; + +impl HgvsVariant { + /// Fill reference bases using the given data provider. + /// + /// # Args + /// + /// * `provider` -- The `Provider` to use for fetching reference bases. + pub fn fill_ref(&self, _provider: &dyn Provider) -> Result { + todo!() + } +} diff --git a/src/parser/impl_parse.rs b/src/parser/impl_parse.rs index 2d5dcdb..04bbb8a 100644 --- a/src/parser/impl_parse.rs +++ b/src/parser/impl_parse.rs @@ -14,7 +14,7 @@ use crate::parser::ds::*; use crate::parser::parse_funcs::*; impl HgvsVariant { - pub fn parse_cds_variant(input: &str) -> IResult<&str, Self> { + fn parse_cds_variant(input: &str) -> IResult<&str, Self> { map( tuple(( Accession::parse, @@ -362,7 +362,7 @@ mod test { }), loc_edit: CdsLocEdit { loc: Mu::Certain(CdsInterval { - begin: CdsPos { + start: CdsPos { base: 123, offset: None, cds_from: CdsFrom::Start @@ -394,7 +394,7 @@ mod test { }), loc_edit: GenomeLocEdit { loc: Mu::Certain(GenomeInterval { - begin: Some(123), + start: Some(123), end: Some(123), }), edit: Mu::Certain(NaEdit::RefAlt { @@ -418,7 +418,7 @@ mod test { }), loc_edit: MtLocEdit { loc: Mu::Certain(MtInterval { - begin: Some(123), + start: Some(123), end: Some(123), }), edit: Mu::Certain(NaEdit::RefAlt { @@ -442,7 +442,7 @@ mod test { }), loc_edit: TxLocEdit { loc: Mu::Certain(TxInterval { - begin: TxPos { + start: TxPos { base: 123, offset: None }, @@ -472,7 +472,7 @@ mod test { }), loc_edit: RnaLocEdit { loc: Mu::Certain(RnaInterval { - begin: RnaPos { + start: RnaPos { base: 123, offset: None }, @@ -502,7 +502,7 @@ mod test { }), loc_edit: ProtLocEdit::Ordinary { loc: Mu::Certain(ProtInterval { - begin: ProtPos { + start: ProtPos { aa: "Leu".to_string(), number: 3 }, @@ -528,7 +528,7 @@ mod test { "", CdsLocEdit { loc: Mu::Uncertain(CdsInterval { - begin: CdsPos { + start: CdsPos { base: 123, offset: None, cds_from: CdsFrom::Start @@ -556,7 +556,7 @@ mod test { "", GenomeLocEdit { loc: Mu::Uncertain(GenomeInterval { - begin: Some(123), + start: Some(123), end: Some(123), }), edit: Mu::Certain(NaEdit::RefAlt { @@ -576,7 +576,7 @@ mod test { "", MtLocEdit { loc: Mu::Uncertain(MtInterval { - begin: Some(123), + start: Some(123), end: Some(123), }), edit: Mu::Certain(NaEdit::RefAlt { @@ -596,7 +596,7 @@ mod test { "", TxLocEdit { loc: Mu::Uncertain(TxInterval { - begin: TxPos { + start: TxPos { base: 123, offset: None, }, @@ -622,7 +622,7 @@ mod test { "", RnaLocEdit { loc: Mu::Uncertain(RnaInterval { - begin: RnaPos { + start: RnaPos { base: 123, offset: None, }, @@ -648,7 +648,7 @@ mod test { "", ProtLocEdit::Ordinary { loc: Mu::Uncertain(ProtInterval { - begin: ProtPos { + start: ProtPos { aa: "Leu".to_string(), number: 123, }, @@ -682,7 +682,7 @@ mod test { Ok(( "", CdsInterval { - begin: CdsPos { + start: CdsPos { base: 123, offset: None, cds_from: CdsFrom::Start, @@ -704,7 +704,7 @@ mod test { Ok(( "", GenomeInterval { - begin: Some(123), + start: Some(123), end: Some(123), } )) @@ -718,7 +718,7 @@ mod test { Ok(( "", MtInterval { - begin: Some(123), + start: Some(123), end: Some(123), } )) @@ -732,7 +732,7 @@ mod test { Ok(( "", TxInterval { - begin: TxPos { + start: TxPos { base: 123, offset: None }, @@ -752,7 +752,7 @@ mod test { Ok(( "", RnaInterval { - begin: RnaPos { + start: RnaPos { base: 123, offset: None }, @@ -772,7 +772,7 @@ mod test { Ok(( "", ProtInterval { - begin: ProtPos { + start: ProtPos { aa: "Leu".to_string(), number: 123 }, diff --git a/src/parser/mod.rs b/src/parser/mod.rs index 2c3a784..5bba051 100644 --- a/src/parser/mod.rs +++ b/src/parser/mod.rs @@ -6,6 +6,7 @@ mod display; mod ds; +mod impl_ops; mod impl_parse; mod parse_funcs; @@ -25,6 +26,36 @@ impl FromStr for HgvsVariant { } } +impl FromStr for GenomeInterval { + type Err = anyhow::Error; + + fn from_str(s: &str) -> Result { + Self::parse(s) + .map_err(|e| anyhow::anyhow!("problem with parsing HGVS genome interval: {:?}", &e)) + .map(|(_rest, g_interval)| g_interval) + } +} + +impl FromStr for TxInterval { + type Err = anyhow::Error; + + fn from_str(s: &str) -> Result { + Self::parse(s) + .map_err(|e| anyhow::anyhow!("problem with parsing HGVS transcript interval: {:?}", &e)) + .map(|(_rest, g_interval)| g_interval) + } +} + +impl FromStr for CdsInterval { + type Err = anyhow::Error; + + fn from_str(s: &str) -> Result { + Self::parse(s) + .map_err(|e| anyhow::anyhow!("problem with parsing HGVS CDS interval: {:?}", &e)) + .map(|(_rest, g_interval)| g_interval) + } +} + #[cfg(test)] mod test { use std::{ @@ -33,7 +64,9 @@ mod test { str::FromStr, }; - use crate::parser::{Accession, CdsFrom, CdsInterval, CdsLocEdit, CdsPos, Mu, NaEdit}; + use crate::parser::{ + Accession, CdsFrom, CdsInterval, CdsLocEdit, CdsPos, GenomeInterval, Mu, NaEdit, + }; use super::HgvsVariant; @@ -48,7 +81,7 @@ mod test { gene_symbol: None, loc_edit: CdsLocEdit { loc: Mu::Certain(CdsInterval { - begin: CdsPos { + start: CdsPos { base: 22, offset: Some(1), cds_from: CdsFrom::Start @@ -109,4 +142,47 @@ mod test { Ok(()) } + + // Test genome interval parsing. + #[test] + fn genome_interval_from_str() -> Result<(), anyhow::Error> { + assert!(GenomeInterval::from_str("x").is_err()); + assert_eq!( + GenomeInterval::from_str("1")?, + GenomeInterval { + start: Some(1), + end: Some(1) + } + ); + assert_eq!( + GenomeInterval::from_str("1_1")?, + GenomeInterval { + start: Some(1), + end: Some(1) + } + ); + assert_eq!( + GenomeInterval::from_str("?_1")?, + GenomeInterval { + start: None, + end: Some(1) + } + ); + assert_eq!( + GenomeInterval::from_str("1_?")?, + GenomeInterval { + start: Some(1), + end: None + } + ); + assert_eq!( + GenomeInterval::from_str("?_?")?, + GenomeInterval { + start: None, + end: None + } + ); + + Ok(()) + } } diff --git a/src/parser/parse_funcs.rs b/src/parser/parse_funcs.rs index 9d1575d..d3b5bdd 100644 --- a/src/parser/parse_funcs.rs +++ b/src/parser/parse_funcs.rs @@ -467,14 +467,14 @@ pub mod cds_pos { pub fn int(input: &str) -> IResult<&str, CdsInterval> { let (rest, (begin, _, end)) = tuple((pos, tag("_"), pos))(input)?; - Ok((rest, CdsInterval { begin, end })) + Ok((rest, CdsInterval { start: begin, end })) } pub fn loc(input: &str) -> IResult<&str, CdsInterval> { alt(( int, map(pos, |pos| CdsInterval { - begin: pos.clone(), + start: pos.clone(), end: pos, }), ))(input) @@ -502,7 +502,7 @@ pub mod genome_pos { pub fn int(input: &str) -> IResult<&str, GenomeInterval> { map(tuple((pos, tag("_"), pos)), |(begin, _, end)| { - GenomeInterval { begin, end } + GenomeInterval { start: begin, end } })(input) } @@ -510,7 +510,7 @@ pub mod genome_pos { alt(( int, map(pos, |pos| GenomeInterval { - begin: pos, + start: pos, end: pos, }), ))(input) @@ -538,7 +538,7 @@ pub mod mt_pos { pub fn int(input: &str) -> IResult<&str, MtInterval> { map(tuple((pos, tag("_"), pos)), |(begin, _, end)| MtInterval { - begin, + start: begin, end, })(input) } @@ -547,7 +547,7 @@ pub mod mt_pos { alt(( int, map(pos, |pos| MtInterval { - begin: pos, + start: pos, end: pos, }), ))(input) @@ -581,14 +581,14 @@ pub mod tx_pos { pub fn int(input: &str) -> IResult<&str, TxInterval> { let (rest, (begin, _, end)) = tuple((pos, tag("_"), pos))(input)?; - Ok((rest, TxInterval { begin, end })) + Ok((rest, TxInterval { start: begin, end })) } pub fn loc(input: &str) -> IResult<&str, TxInterval> { alt(( int, map(pos, |pos| TxInterval { - begin: pos.clone(), + start: pos.clone(), end: pos, }), ))(input) @@ -622,14 +622,14 @@ pub mod rna_pos { pub fn int(input: &str) -> IResult<&str, RnaInterval> { let (rest, (begin, _, end)) = tuple((pos, tag("_"), pos))(input)?; - Ok((rest, RnaInterval { begin, end })) + Ok((rest, RnaInterval { start: begin, end })) } pub fn loc(input: &str) -> IResult<&str, RnaInterval> { alt(( int, map(pos, |pos| RnaInterval { - begin: pos.clone(), + start: pos.clone(), end: pos, }), ))(input) @@ -664,14 +664,14 @@ pub mod prot_pos { pub fn int(input: &str) -> IResult<&str, ProtInterval> { let (rest, (begin, _, end)) = tuple((pos, tag("_"), pos))(input)?; - Ok((rest, ProtInterval { begin, end })) + Ok((rest, ProtInterval { start: begin, end })) } pub fn loc(input: &str) -> IResult<&str, ProtInterval> { alt(( int, map(pos, |pos| ProtInterval { - begin: pos.clone(), + start: pos.clone(), end: pos, }), ))(input) @@ -1606,7 +1606,7 @@ mod test { Ok(( "", MtInterval { - begin: Some(42), + start: Some(42), end: Some(42), } )) @@ -1620,7 +1620,7 @@ mod test { Ok(( "", MtInterval { - begin: Some(42), + start: Some(42), end: Some(100), } )) @@ -1630,7 +1630,7 @@ mod test { Ok(( "", MtInterval { - begin: None, + start: None, end: Some(100), } )) @@ -1640,7 +1640,7 @@ mod test { Ok(( "", MtInterval { - begin: Some(42), + start: Some(42), end: None, } )) @@ -1650,7 +1650,7 @@ mod test { Ok(( "", MtInterval { - begin: None, + start: None, end: None, } )) @@ -1664,7 +1664,7 @@ mod test { Ok(( "", CdsInterval { - begin: CdsPos { + start: CdsPos { base: 123, offset: None, cds_from: CdsFrom::Start, @@ -1686,7 +1686,7 @@ mod test { Ok(( "", CdsInterval { - begin: CdsPos { + start: CdsPos { base: 123, offset: None, cds_from: CdsFrom::Start, @@ -1704,7 +1704,7 @@ mod test { Ok(( "", CdsInterval { - begin: CdsPos { + start: CdsPos { base: 123, offset: None, cds_from: CdsFrom::Start, @@ -1722,7 +1722,7 @@ mod test { Ok(( "", CdsInterval { - begin: CdsPos { + start: CdsPos { base: -123, offset: None, cds_from: CdsFrom::Start, @@ -1740,7 +1740,7 @@ mod test { Ok(( "", CdsInterval { - begin: CdsPos { + start: CdsPos { base: 123, offset: Some(42), cds_from: CdsFrom::Start, @@ -1758,7 +1758,7 @@ mod test { Ok(( "", CdsInterval { - begin: CdsPos { + start: CdsPos { base: 123, offset: Some(42), cds_from: CdsFrom::Start, @@ -1776,7 +1776,7 @@ mod test { Ok(( "", CdsInterval { - begin: CdsPos { + start: CdsPos { base: -123, offset: Some(42), cds_from: CdsFrom::Start, @@ -1794,7 +1794,7 @@ mod test { Ok(( "", CdsInterval { - begin: CdsPos { + start: CdsPos { base: 123, offset: Some(-42), cds_from: CdsFrom::Start, @@ -1812,7 +1812,7 @@ mod test { Ok(( "", CdsInterval { - begin: CdsPos { + start: CdsPos { base: 123, offset: Some(-42), cds_from: CdsFrom::Start, @@ -1830,7 +1830,7 @@ mod test { Ok(( "", CdsInterval { - begin: CdsPos { + start: CdsPos { base: -123, offset: Some(-42), cds_from: CdsFrom::Start, @@ -1988,7 +1988,7 @@ mod test { Ok(( "", GenomeInterval { - begin: Some(42), + start: Some(42), end: Some(42), } )) @@ -2002,7 +2002,7 @@ mod test { Ok(( "", GenomeInterval { - begin: Some(42), + start: Some(42), end: Some(100), } )) @@ -2012,7 +2012,7 @@ mod test { Ok(( "", GenomeInterval { - begin: None, + start: None, end: Some(100), } )) @@ -2022,7 +2022,7 @@ mod test { Ok(( "", GenomeInterval { - begin: Some(42), + start: Some(42), end: None, } )) @@ -2032,7 +2032,7 @@ mod test { Ok(( "", GenomeInterval { - begin: None, + start: None, end: None, } )) @@ -2046,7 +2046,7 @@ mod test { Ok(( "", TxInterval { - begin: TxPos { + start: TxPos { base: 123, offset: None, }, @@ -2066,7 +2066,7 @@ mod test { Ok(( "", TxInterval { - begin: TxPos { + start: TxPos { base: 123, offset: None, }, @@ -2082,7 +2082,7 @@ mod test { Ok(( "", TxInterval { - begin: TxPos { + start: TxPos { base: 123, offset: None, }, @@ -2098,7 +2098,7 @@ mod test { Ok(( "", TxInterval { - begin: TxPos { + start: TxPos { base: -123, offset: None, }, @@ -2114,7 +2114,7 @@ mod test { Ok(( "", TxInterval { - begin: TxPos { + start: TxPos { base: 123, offset: Some(42), }, @@ -2130,7 +2130,7 @@ mod test { Ok(( "", TxInterval { - begin: TxPos { + start: TxPos { base: 123, offset: Some(42), }, @@ -2146,7 +2146,7 @@ mod test { Ok(( "", TxInterval { - begin: TxPos { + start: TxPos { base: -123, offset: Some(42), }, @@ -2162,7 +2162,7 @@ mod test { Ok(( "", TxInterval { - begin: TxPos { + start: TxPos { base: 123, offset: Some(-42), }, @@ -2178,7 +2178,7 @@ mod test { Ok(( "", TxInterval { - begin: TxPos { + start: TxPos { base: 123, offset: Some(-42), }, @@ -2194,7 +2194,7 @@ mod test { Ok(( "", TxInterval { - begin: TxPos { + start: TxPos { base: -123, offset: Some(-42), }, @@ -2308,7 +2308,7 @@ mod test { Ok(( "", RnaInterval { - begin: RnaPos { + start: RnaPos { base: 123, offset: None, }, @@ -2328,7 +2328,7 @@ mod test { Ok(( "", RnaInterval { - begin: RnaPos { + start: RnaPos { base: 123, offset: None, }, @@ -2344,7 +2344,7 @@ mod test { Ok(( "", RnaInterval { - begin: RnaPos { + start: RnaPos { base: 123, offset: None, }, @@ -2360,7 +2360,7 @@ mod test { Ok(( "", RnaInterval { - begin: RnaPos { + start: RnaPos { base: -123, offset: None, }, @@ -2376,7 +2376,7 @@ mod test { Ok(( "", RnaInterval { - begin: RnaPos { + start: RnaPos { base: 123, offset: Some(42), }, @@ -2392,7 +2392,7 @@ mod test { Ok(( "", RnaInterval { - begin: RnaPos { + start: RnaPos { base: 123, offset: Some(42), }, @@ -2408,7 +2408,7 @@ mod test { Ok(( "", RnaInterval { - begin: RnaPos { + start: RnaPos { base: -123, offset: Some(42), }, @@ -2424,7 +2424,7 @@ mod test { Ok(( "", RnaInterval { - begin: RnaPos { + start: RnaPos { base: 123, offset: Some(-42), }, @@ -2440,7 +2440,7 @@ mod test { Ok(( "", RnaInterval { - begin: RnaPos { + start: RnaPos { base: 123, offset: Some(-42), }, @@ -2456,7 +2456,7 @@ mod test { Ok(( "", RnaInterval { - begin: RnaPos { + start: RnaPos { base: -123, offset: Some(-42), }, @@ -2624,7 +2624,7 @@ mod test { Ok(( "", ProtInterval { - begin: ProtPos { + start: ProtPos { aa: "Leu".to_string(), number: 10 }, @@ -2644,7 +2644,7 @@ mod test { Ok(( "", ProtInterval { - begin: ProtPos { + start: ProtPos { aa: "Leu".to_string(), number: 10 }, @@ -2660,7 +2660,7 @@ mod test { Ok(( "", ProtInterval { - begin: ProtPos { + start: ProtPos { aa: "L".to_string(), number: 10 }, @@ -2676,7 +2676,7 @@ mod test { Ok(( "", ProtInterval { - begin: ProtPos { + start: ProtPos { aa: "Leu".to_string(), number: 10 }, @@ -2692,7 +2692,7 @@ mod test { Ok(( "", ProtInterval { - begin: ProtPos { + start: ProtPos { aa: "L".to_string(), number: 10 }, @@ -2708,7 +2708,7 @@ mod test { Ok(( "", ProtInterval { - begin: ProtPos { + start: ProtPos { aa: "L".to_string(), number: 10 }, diff --git a/src/static_data/mod.rs b/src/static_data/mod.rs index 7ae2e63..fb9ae4c 100644 --- a/src/static_data/mod.rs +++ b/src/static_data/mod.rs @@ -7,11 +7,13 @@ use flate2::read::GzDecoder; use serde::Deserialize; const GRCH37_JSON_GZ: &[u8] = include_bytes!("_data/GRCh37.json.gz"); +const GRCH37_P10_JSON_GZ: &[u8] = include_bytes!("_data/GRCh37.p10.json.gz"); const GRCH38_JSON_GZ: &[u8] = include_bytes!("_data/GRCh38.json.gz"); #[derive(Debug, Deserialize, Enum)] pub enum Assembly { Grch37, + Grch37p10, Grch38, } @@ -20,6 +22,7 @@ impl Assembly { fn load_assembly_info(&self) -> AssemblyInfo { let payload = match self { Assembly::Grch37 => GRCH37_JSON_GZ, + Assembly::Grch37p10 => GRCH37_P10_JSON_GZ, Assembly::Grch38 => GRCH38_JSON_GZ, }; let mut d = GzDecoder::new(payload); @@ -56,6 +59,7 @@ lazy_static::lazy_static! { /// Provide information about the assemblies. pub static ref ASSEMBLY_INFOS: EnumMap = enum_map! { Assembly::Grch37 => Assembly::Grch37.load_assembly_info(), + Assembly::Grch37p10 => Assembly::Grch37p10.load_assembly_info(), Assembly::Grch38 => Assembly::Grch38.load_assembly_info(), }; } @@ -69,6 +73,7 @@ mod test { #[test] fn smoke() { assert_eq!(ASSEMBLY_INFOS[Assembly::Grch37].sequences.len(), 92); + assert_eq!(ASSEMBLY_INFOS[Assembly::Grch37p10].sequences.len(), 275); assert_eq!(ASSEMBLY_INFOS[Assembly::Grch38].sequences.len(), 455); } } diff --git a/src/validator/mod.rs b/src/validator/mod.rs new file mode 100644 index 0000000..895d52b --- /dev/null +++ b/src/validator/mod.rs @@ -0,0 +1,89 @@ +//! Implementation of validation. + +use std::rc::Rc; + +use crate::{data::interface::Provider, parser::HgvsVariant}; + +#[derive(Debug, PartialEq, Clone, Copy)] +pub enum ValidationLevel { + Null, + Intrinsic, + Extrinsic, +} + +impl ValidationLevel { + pub fn validator(&self, strict: bool, provider: Rc) -> Box { + match self { + ValidationLevel::Null => Box::new(NullValidator::new(strict)), + ValidationLevel::Intrinsic => Box::new(IntrinsicValidator::new(strict)), + ValidationLevel::Extrinsic => Box::new(ExtrinsicValidator::new(strict, provider)), + } + } +} + +pub trait Validator { + fn is_strict(&self) -> bool; + + fn validate(&self, var: &HgvsVariant) -> Result<(), anyhow::Error>; +} + +pub struct NullValidator { + strict: bool, +} + +impl NullValidator { + pub fn new(strict: bool) -> Self { + Self { strict } + } +} + +impl Validator for NullValidator { + fn is_strict(&self) -> bool { + self.strict + } + + fn validate(&self, _var: &HgvsVariant) -> Result<(), anyhow::Error> { + todo!() + } +} + +pub struct IntrinsicValidator { + strict: bool, +} + +impl IntrinsicValidator { + pub fn new(strict: bool) -> Self { + Self { strict } + } +} + +impl Validator for IntrinsicValidator { + fn is_strict(&self) -> bool { + self.strict + } + + fn validate(&self, _var: &HgvsVariant) -> Result<(), anyhow::Error> { + todo!() + } +} + +pub struct ExtrinsicValidator { + strict: bool, + provider: Rc, +} + +impl ExtrinsicValidator { + pub fn new(strict: bool, provider: Rc) -> Self { + Self { strict, provider } + } +} + +impl Validator for ExtrinsicValidator { + fn is_strict(&self) -> bool { + self.strict + } + + fn validate(&self, _var: &HgvsVariant) -> Result<(), anyhow::Error> { + todo!() + } +} diff --git a/tests/data/data/bootstrap.sh b/tests/data/data/bootstrap.sh index a231053..2bc0cab 100644 --- a/tests/data/data/bootstrap.sh +++ b/tests/data/data/bootstrap.sh @@ -64,7 +64,7 @@ VERSION=$2 DST=$SCRIPT_DIR # The HGNC symbols of the genes to fetc. -GENES="OMA1 OPA1" +GENES="OMA1 OPA1 LCE3C H2AW LCE2B PTH2 SRD5A2" # Transform gene list for postgres query. PG_GENES=$(pg-list $GENES) @@ -78,12 +78,12 @@ cd $DST # download database dumps -mkdir -p download -cd download -for f in $VERSION.pgd.gz{,.sha1}; do - test -e $f || wget $DL_URL/$f -done -cd .. +#mkdir -p download +#cd download +#for f in $VERSION.pgd.gz{,.sha1}; do +# test -e $f || wget $DL_URL/$f +#done +#cd .. # extract identifiers diff --git a/tests/data/data/uta_20210129-subset.pgd.gz b/tests/data/data/uta_20210129-subset.pgd.gz index cd14e73..4f187c3 100644 Binary files a/tests/data/data/uta_20210129-subset.pgd.gz and b/tests/data/data/uta_20210129-subset.pgd.gz differ