diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml index 61c2dc2..15b214d 100644 --- a/.github/workflows/rust.yml +++ b/.github/workflows/rust.yml @@ -100,7 +100,7 @@ jobs: uses: actions-rs/tarpaulin@v0.1 with: version: 0.21.0 - args: "-- --test-threads 1 ${{ matrix.flag }}" + args: "-- --test-threads 1" env: TEST_UTA_DATABASE_URL: postgres://uta_admin:uta_admin@0.0.0.0/uta TEST_UTA_DATABASE_SCHEMA: uta_20210129 @@ -112,7 +112,7 @@ jobs: uses: actions-rs/cargo@v1 with: command: test - args: "--release -- ${{ matrix.flag }}" + args: "--release -- --include-ignored" env: TEST_UTA_DATABASE_URL: postgres://uta_admin:uta_admin@0.0.0.0/uta TEST_UTA_DATABASE_SCHEMA: uta_20210129 diff --git a/Cargo.toml b/Cargo.toml index bf6e15c..0326479 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -27,5 +27,6 @@ serde_json = "1.0.93" [dev-dependencies] csv = "1.2.0" env_logger = "0.10.0" +rstest = "0.16.0" serde = { version = "1.0.152", features = ["derive"] } test-log = "0.2.11" diff --git a/README.md b/README.md index a5167a3..f46d796 100644 --- a/README.md +++ b/README.md @@ -31,7 +31,7 @@ export TEST_SEQREPO_CACHE_PATH=tests/data/seqrepo_cache.fasta ``` When running the tests with `cargo test`, the cache file will be (re-)written. -Note that you have to use `cargo test --release -- --test-threads 1 --ignored` when writing the cache for enforcing a single test writing to the cache at any time. +Note that you have to use `cargo test --release -- --test-threads 1 --include-ignored` when writing the cache for enforcing a single test writing to the cache at any time. If you don't want to regenerate the cache then you can use the following settings. With these settings, the cache will only be read. diff --git a/src/mapper/alignment.rs b/src/mapper/alignment.rs index 9271c7d..295f7b8 100644 --- a/src/mapper/alignment.rs +++ b/src/mapper/alignment.rs @@ -339,7 +339,7 @@ impl Mapper { .map_tgt_to_ref(frs, "start", self.config.strict_bounds)?; let gre = self .cigar_mapper - .map_tgt_to_ref(fre, "start", self.config.strict_bounds)?; + .map_tgt_to_ref(fre, "end", self.config.strict_bounds)?; let (grs_pos, gre_pos) = (grs.pos + self.gc_offset + 1, gre.pos + self.gc_offset + 1); let (gs, ge) = (grs_pos + start_offset, gre_pos + end_offset); diff --git a/src/mapper/assembly.rs b/src/mapper/assembly.rs new file mode 100644 index 0000000..330b4f6 --- /dev/null +++ b/src/mapper/assembly.rs @@ -0,0 +1,954 @@ +//! Implementation of `AssemblyMapper`. + +use std::collections::{HashMap, HashSet}; +use std::ops::Range; +use std::rc::Rc; + +use crate::mapper::variant::{Config as VariantMapperConfig, Mapper as VariantMapper}; +use crate::parser::HgvsVariant; +use crate::{data::interface::Provider, static_data::Assembly, validator::ValidationLevel}; + +#[derive(Debug, PartialEq, Eq, Default)] +pub enum InParAssume { + #[default] + X, + Y, + None, +} + +impl InParAssume { + fn chrom_name(&self) -> &str { + match &self { + InParAssume::X => "X", + InParAssume::Y => "Y", + InParAssume::None => panic!("no chromosome name"), + } + } +} + +/// Configuration for `Assemblymapper`. +#[derive(Debug)] +pub struct Config { + pub assembly: Assembly, + pub alt_aln_method: String, + pub normalize: bool, + pub in_par_assume: InParAssume, + pub prevalidation_level: ValidationLevel, + pub replace_reference: bool, + pub strict_validation: bool, + pub strict_bounds: bool, + pub add_gene_symbol: bool, +} + +impl Default for Config { + fn default() -> Self { + Self { + assembly: Assembly::Grch38, + alt_aln_method: "splign".to_string(), + normalize: true, + in_par_assume: InParAssume::X, + prevalidation_level: ValidationLevel::Intrinsic, + replace_reference: true, + strict_validation: true, + strict_bounds: true, + add_gene_symbol: false, + } + } +} + +/// Provides simplified variant mapping for a single assembly and transcript-reference +/// alignment method. +/// +/// `AssemblyMapper` uses `VariantMapper`, to provide all projection functionality, and add: +/// +/// * Automatic selection of genomic sequence accession +/// * Transcript selection from genomic coordinates +/// * Normalization after projection +/// * Special handling for PAR regions +/// +/// `AssemblyMapper` is instantiated with an assembly name and `alt_aln_method`. These enable +/// the following conveniences over `VariantMapper`: +/// +/// * The assembly and alignment method are used to automatically select an appropriate +/// chromosomal reference sequence when mapping from a transcript to a genome (i.e., +/// `c_to_g()` and `n_to_g()`). +/// +/// * A new method, relevant_trancripts(g_variant), returns a list of transcript accessions +/// available for the specified variant. These accessions are candidates mapping from +/// genomic to trancript coordinates (i.e., g_to_c(...) and g_to_n(...)). +/// +/// Note: AssemblyMapper supports only chromosomal references (e.g. NC_000006.11). It does +/// not support contigs or other genomic sequences (e.g., NT_167249.1). +pub struct Mapper { + config: Config, + provider: Rc<dyn Provider>, + inner: VariantMapper, + /// Accessions of contigs in assembly. + asm_accessions: HashSet<String>, + /// Map from accession to contig name. + asm_map: HashMap<String, String>, +} + +impl Mapper { + /// Construct new assembly mapper from config and provider. + pub fn new(config: Config, provider: Rc<dyn Provider>) -> Self { + let inner_config = VariantMapperConfig { + replace_reference: config.replace_reference, + strict_validation: config.strict_validation, + prevalidation_level: config.prevalidation_level, + add_gene_symbol: config.add_gene_symbol, + strict_bounds: config.strict_bounds, + }; + let inner = VariantMapper::new(&inner_config, provider.clone()); + let asm_accessions = provider + .as_ref() + .get_assembly_map(config.assembly) + .keys() + .clone() + .into_iter() + .map(|s| s.to_string()) + .collect::<HashSet<_>>(); + let asm_map = HashMap::from_iter( + provider + .as_ref() + .get_assembly_map(config.assembly) + .iter() + .map(|(key, value)| (key.clone(), value.clone())), + ); + Self { + config, + provider, + inner, + asm_accessions, + asm_map, + } + } + + /// Convert from genome (g.) variant to transcript variant (n.). + /// + /// # Args + /// + /// * `var_g` -- `HgvsVariant::GenomeVariant` to project + /// * `tx_ac` -- accession of transcript to project to + pub fn g_to_n(&self, var_g: &HgvsVariant, tx_ac: &str) -> Result<HgvsVariant, anyhow::Error> { + let var = self + .inner + .g_to_n(var_g, tx_ac, &self.config.alt_aln_method)?; + self.maybe_normalize(&var) + } + + /// Convert from genome (g.) variant to CDS variant (c.). + /// + /// # Args + /// + /// * `var_g` -- `HgvsVariant::GenomeVariant` to project + /// * `tx_ac` -- accession of transcript to project to + pub fn g_to_c(&self, var_g: &HgvsVariant, tx_ac: &str) -> Result<HgvsVariant, anyhow::Error> { + let var = self + .inner + .g_to_c(var_g, tx_ac, &self.config.alt_aln_method)?; + self.maybe_normalize(&var) + } + + /// Convert from genome (g.) variant to transcript variant (g. or n.). + /// + /// # Args + /// + /// * `var_g` -- `HgvsVariant::GenomeVariant` to project + /// * `tx_ac` -- accession of transcript to project to + pub fn g_to_t(&self, var_g: &HgvsVariant, tx_ac: &str) -> Result<HgvsVariant, anyhow::Error> { + let var = self + .inner + .g_to_t(var_g, tx_ac, &self.config.alt_aln_method)?; + self.maybe_normalize(&var) + } + + /// Convert from transcript variant (g.) variant to genome variant (n.). + /// + /// # Args + /// + /// * `var_n` -- `HgvsVariant::TxVariant` to project + pub fn n_to_g(&self, var_n: &HgvsVariant) -> Result<HgvsVariant, anyhow::Error> { + let alt_ac = self.alt_ac_for_tx_ac(var_n.accession())?; + let var = self + .inner + .n_to_g(var_n, &alt_ac, &self.config.alt_aln_method)?; + self.maybe_normalize(&var) + } + + /// Convert from CDS variant (c.) to genome variant (g.). + /// + /// # Args + /// + /// * `var_c` -- `HgvsVariant::CdsVariant` to project + /// * `alt_ac` -- alternative contig accession + /// * `alt_al_method` -- alignment method, e.g., `"splign"` + pub fn c_to_g(&self, var_c: &HgvsVariant) -> Result<HgvsVariant, anyhow::Error> { + let alt_ac = self.alt_ac_for_tx_ac(var_c.accession())?; + let var = self + .inner + .c_to_g(var_c, &alt_ac, &self.config.alt_aln_method)?; + self.maybe_normalize(&var) + } + + /// Convert from transcript (c. or n.) to genome (g.) variant. + /// + /// # Args + /// + /// * `var_t` -- `HgvsVariant::TxVariant` or `HgvsVariant::CdsVariant` to project + /// * `alt_ac` -- accession of alternativ esequence + /// * `alt_al_method` -- alignment method, e.g., `"splign"` + pub fn t_to_g(&self, var_t: &HgvsVariant) -> Result<HgvsVariant, anyhow::Error> { + let alt_ac = self.alt_ac_for_tx_ac(var_t.accession())?; + let var = self + .inner + .t_to_g(var_t, &alt_ac, &self.config.alt_aln_method)?; + self.maybe_normalize(&var) + } + + /// Convert from CDS variant (c.) to transcript variant (n.). + /// + /// # Args + /// + /// * `var_c` -- `HgvsVariant::CdsVariant` to project + pub fn c_to_n(&self, var_c: &HgvsVariant) -> Result<HgvsVariant, anyhow::Error> { + let var = self.inner.c_to_n(var_c)?; + self.maybe_normalize(&var) + } + + /// Convert from transcript variant (n.) to CDS variant (c.). + /// + /// # Args + /// + /// * `var_n` -- `HgvsVariant::TxVariant` to project + pub fn n_to_c(&self, var_n: &HgvsVariant) -> Result<HgvsVariant, anyhow::Error> { + let var = self.inner.n_to_c(var_n)?; + self.maybe_normalize(&var) + } + + /// Convert from CDS variant (c.) to protein variant (p.). + /// + /// # Args + /// + /// * `var_c` -- `HgvsVariant::TxVariant` to project + pub fn c_to_p(&self, var_c: &HgvsVariant) -> Result<HgvsVariant, anyhow::Error> { + let var = self.inner.c_to_p(var_c, None)?; + self.maybe_normalize(&var) + } + + /// Obtain relevant transcript accessions. + /// + /// # Args + /// + /// * `var_g` -- genome variant to obtain the transcript accessions for. + /// + /// # Returns + /// + /// `Vec` of relevant transcript accessions. + pub fn relevant_transcripts(&self, var_g: &HgvsVariant) -> Result<Vec<String>, anyhow::Error> { + match var_g { + HgvsVariant::GenomeVariant { loc_edit, .. } => { + let r: Range<i32> = loc_edit.loc.inner().clone().try_into()?; + Ok(self + .provider + .as_ref() + .get_tx_for_region( + var_g.accession(), + &self.config.alt_aln_method, + r.start, + r.end, + )? + .into_iter() + .map(|rec| rec.tx_ac) + .collect::<Vec<_>>()) + } + _ => return Err(anyhow::anyhow!("Not a GenomeVariant: {}", &var_g)), + } + } + + /// Normalize variant if requested and ignore errors. This is better than checking whether + /// the variant is intronic because future UTAs will support LRG, which will enable checking + /// intronic variants. + fn maybe_normalize(&self, var: &HgvsVariant) -> Result<HgvsVariant, anyhow::Error> { + if self.config.normalize { + let normalizer = self.inner.normalizer()?; + normalizer.normalize(var).or_else(|_| { + log::warn!("Normalization of variable {} failed", &var); + Ok(var.clone()) + }) + } else { + Ok(var.clone()) + } + } + + /// Return chromosomal accession for the given transcript accession and the assembly and + /// aln_method from configuration. + fn alt_ac_for_tx_ac(&self, tx_ac: &str) -> Result<String, anyhow::Error> { + let mut alt_acs = Vec::new(); + for record in self.provider.get_tx_mapping_options(tx_ac)? { + if record.alt_aln_method == self.config.alt_aln_method + && self.asm_accessions.contains(&record.alt_ac) + { + alt_acs.push(record.alt_ac); + } + } + + if alt_acs.is_empty() { + return Err(anyhow::anyhow!( + "No alignments for {} in {:?} using {}", + tx_ac, + self.config.assembly, + self.config.alt_aln_method + )); + } + + if alt_acs.len() == 1 { + Ok(alt_acs.first().unwrap().to_owned()) + } else { + // alt_acs.len() > 1 + // Perform sanity check on result if more than one contig is returned. + let mut alts = alt_acs + .iter() + .map(|ac| self.asm_map.get(ac)) + .filter(|x| x.is_some()) + .map(|x| x.unwrap().clone()) + .collect::<Vec<_>>(); + alts.sort(); + if alts.join("") != "XY" { + return Err(anyhow::anyhow!( + "Multiple chromosomal alignments for {} in {:?} using {} \ + (non-pseudoautosomal region) [{:?}]", + tx_ac, + self.config.assembly, + self.config.alt_aln_method, + &alts + )); + } + + // Assume PAR + if self.config.in_par_assume == InParAssume::None { + return Err(anyhow::anyhow!( + "Multiple chromosomal alignments for {} in {:?} using {} \ + (likely pseudoautosomal region)", + tx_ac, + self.config.assembly, + self.config.alt_aln_method, + )); + } + + let alt_acs = alt_acs + .into_iter() + .filter(|ac| ac == self.config.in_par_assume.chrom_name()) + .collect::<Vec<_>>(); + if alt_acs.len() != 1 { + Err(anyhow::anyhow!( + "Multiple chromosomal alignments for {} in {:?} using {} \ + (in_par_assume={:?} selected {} of them)", + tx_ac, + self.config.assembly, + self.config.alt_aln_method, + self.config.in_par_assume, + alt_acs.len() + )) + } else { + Ok(alt_acs.first().unwrap().to_owned()) + } + } + } + + pub fn replace_reference(&self, var: HgvsVariant) -> Result<HgvsVariant, anyhow::Error> { + self.inner.replace_reference(var) + } +} + +#[cfg(test)] +mod test { + use crate::{data::uta_sr::test_helpers::build_provider, static_data::Assembly}; + + use super::{Config, Mapper}; + + fn build_mapper_38(normalize: bool) -> Result<Mapper, anyhow::Error> { + let provider = build_provider()?; + let config = Config { + assembly: Assembly::Grch38, + normalize, + ..Config::default() + }; + Ok(Mapper::new(config, provider)) + } + + fn build_mapper_37(normalize: bool) -> Result<Mapper, anyhow::Error> { + let provider = build_provider()?; + let config = Config { + assembly: Assembly::Grch37, + normalize, + ..Config::default() + }; + Ok(Mapper::new(config, provider)) + } + + #[test] + fn smoke() -> Result<(), anyhow::Error> { + build_mapper_38(true)?; + Ok(()) + } + + /// The following is a port of `Test_VariantMapper` in + /// `test_hgvs_variantmapper_near_discrepancies.py` (sic!) + mod cases { + use std::str::FromStr; + + use rstest::rstest; + + use crate::parser::{HgvsVariant, NoRef}; + + use super::{build_mapper_37, build_mapper_38}; + + #[test] + fn test_quick_aoah() -> Result<(), anyhow::Error> { + let mapper = build_mapper_38(true)?; + let hgvs_g = "NC_000007.13:g.36561662C>T"; + let hgvs_c = "NM_001637.3:c.1582G>A"; + let hgvs_p = "NP_001628.1:p.Gly528Arg"; + + let var_g = HgvsVariant::from_str(hgvs_g)?; + let var_c = mapper.g_to_c(&var_g, "NM_001637.3")?; + let var_p = mapper.c_to_p(&var_c)?; + + assert_eq!(format!("{}", var_c), hgvs_c); + assert_eq!(format!("{}", var_p), hgvs_p); + + Ok(()) + } + + #[test] + fn test_fail_c_to_p_brca2() -> Result<(), anyhow::Error> { + let mapper = build_mapper_38(true)?; + let hgvs_c = "NM_000059.3:c.7790delAAG"; + let var_c = HgvsVariant::from_str(hgvs_c)?; + + assert!(mapper.c_to_p(&var_c).is_err()); + + Ok(()) + } + + #[rstest] + #[case("NM_000059.3:c.7791A>G", "NP_000050.2:p.Lys2597=", "BRCA2", 38)] + #[case("NM_000302.3:c.1594_1596del", "NP_000293.2:p.Glu532del", "PLOD1", 38)] + #[case( + "NM_000090.3:c.2490_2516del", + "NP_000081.1:p.Glu832_Gly840del", + "COL3A1", + 38 + )] + #[case("NM_001292004.1:c.376=", "NC_000005.10:g.74715659=", "HEXB", 38)] + #[case( + "NM_000116.4:c.-8_-3inv6", + "NC_000023.11:g.154411836_154411841inv", + "TAZ", + 38 + )] + #[case( + "NM_000348.3:c.89_91inv3", + "NC_000002.12:g.31580810_31580812inv", + "SDR5A2", + 38 + )] + #[case("NM_001637.3:c.1582_1583inv", "NP_001628.1:p.Gly528Pro", "AOAH", 38)] + #[case("NM_025137.3:c.-20_*20inv", "NP_079413.3:p.?", "SPG11", 38)] + fn project_c_to_x( + #[case] hgvs_c: &str, + #[case] hgvs_x: &str, + #[case] gene: &str, + #[case] build: i32, + ) -> Result<(), anyhow::Error> { + let mapper = if build == 38 { + build_mapper_38(true)? + } else { + build_mapper_37(true)? + }; + + let var_c = HgvsVariant::from_str(hgvs_c)?; + let var_x = HgvsVariant::from_str(hgvs_x)?; + let actual = match &var_x { + HgvsVariant::GenomeVariant { .. } => mapper.c_to_g(&var_c)?, + HgvsVariant::ProtVariant { .. } => mapper.c_to_p(&var_c)?, + _ => panic!("not implemented"), + }; + + assert_eq!(format!("{}", &NoRef(&actual)), hgvs_x, "gene={}", gene); + + Ok(()) + } + + #[rstest] + #[case( + "NC_000019.10:g.50378563_50378564insTG", + "NM_007121.5:n.796_798delinsTG", + "NR1H2", + 38 + )] + #[case( + "NC_000007.14:g.149779575delC", + "NM_198455.2:n.1115_1116insAG", + "SSPO", + 38 + )] + #[case( + "NC_000012.11:g.122064775C>T", + "NM_032790.3:c.127_128insTGCCAC", + "ORAI1", + 38 + )] + #[case( + "NC_000002.11:g.73675227_73675228insCTC", + "NM_015120.4:c.1574_1576=", + "ALMS1", + 37 + )] + #[case("NM_000116.4:c.-120_-119insT", "NC_000023.10:g.153640061=", "TAZ", 37)] + #[case( + "NM_000348.3:c.88del", + "NC_000002.11:g.(31805882_31805883)=", + "SRD5A2", + 37 + )] + fn project_at_alignment_discrepancies( + #[case] hgvs_lhs: &str, + #[case] hgvs_rhs: &str, + #[case] gene: &str, + #[case] build: i32, + ) -> Result<(), anyhow::Error> { + let mapper = if build == 38 { + build_mapper_38(true)? + } else { + build_mapper_37(true)? + }; + + let var_lhs = HgvsVariant::from_str(hgvs_lhs)?; + let var_rhs = HgvsVariant::from_str(hgvs_rhs)?; + let actual = match (&var_lhs, &var_rhs) { + (HgvsVariant::CdsVariant { .. }, HgvsVariant::GenomeVariant { .. }) => { + mapper.c_to_g(&var_lhs)? + } + (HgvsVariant::GenomeVariant { .. }, HgvsVariant::CdsVariant { .. }) => { + mapper.g_to_c(&var_lhs, &var_rhs.accession())? + } + (HgvsVariant::GenomeVariant { .. }, HgvsVariant::TxVariant { .. }) => { + mapper.g_to_n(&var_lhs, &var_rhs.accession())? + } + _ => panic!("not implemented"), + }; + + assert_eq!( + format!("{}", &crate::parser::NoRef(&actual)), + hgvs_rhs, + "gene={}", + gene + ); + + Ok(()) + } + + #[rstest] + #[case( + "NM_080877.2:c.1733_1735delinsTTT", + "NP_543153.1:p.Pro578_Lys579delinsLeuTer", + "SLC34A3" + )] + #[case( + "NM_001034853.1:c.2847_2848delAGinsCT", + "NP_001030025.1:p.Glu949_Glu950delinsAspTer", + "RPGR" + )] + #[case( + "NM_001034853.1:c.2847_2848inv", + "NP_001030025.1:p.Glu949_Glu950delinsAspTer", + "RPGR" + )] + #[case("NM_080877.2:c.1735A>T", "NP_543153.1:p.Lys579Ter", "SLC34A3")] + #[case("NM_080877.2:c.1795_*3delinsTAG", "NP_543153.1:p.Leu599Ter", "SLC34A3")] + fn c_to_p_with_stop_gain( + #[case] hgvs_c: &str, + #[case] hgvs_p: &str, + #[case] gene: &str, + ) -> Result<(), anyhow::Error> { + let mapper = build_mapper_38(true)?; + let var_c = HgvsVariant::from_str(hgvs_c)?; + + let actual = mapper.c_to_p(&var_c)?; + + assert_eq!( + format!("{}", &crate::parser::NoRef(&actual)), + hgvs_p, + "gene={}", + gene + ); + + Ok(()) + } + } + + /// The following is a port of the `Test_RefReplacement` in + /// `test_hgvs_variantmapper_near_discrepancies.py` (sic!) + mod ref_replacement { + use std::str::FromStr; + + use rstest::rstest; + + use crate::parser::HgvsVariant; + + use super::build_mapper_38; + + // These casese attempt to test reference update in four dimensions: + // + // - variant type: n, c, g + // - major mapping paths: c<->n, c<->g, n<->g + // - variant class: sub, del, ins, delins, dup + // - strand: +/- + // + // ADRB2 │ NM_000024.5 │ 239 │ 1481 │ NC_000005.9 │ 1 │ 148206155,148208197 | 284=1X32=1X1724= + // + // cseq = hdp.fetch_seq("NM_000024.5") + // gseq = hdp.fetch_seq("NC_000005.9",148206155,148208197) + // cseq[280:290] = "CAATAGAAGC" + // gseq[280:290] = "CAATGGAAGC" + // ^ @ n.285 + #[rstest] + // These variants are in and around the first sub: + #[case( + "NM_000024.5:c.42C>N", + "NC_000005.9:g.148206436C>N", + "NM_000024.5:n.281C>N", + "ADRB2" + )] + #[case( + "NM_000024.5:c.43A>N", + "NC_000005.9:g.148206437A>N", + "NM_000024.5:n.282A>N", + "ADRB2" + )] + #[case( + "NM_000024.5:c.44A>N", + "NC_000005.9:g.148206438A>N", + "NM_000024.5:n.283A>N", + "ADRB2" + )] + #[case( + "NM_000024.5:c.45T>N", + "NC_000005.9:g.148206439T>N", + "NM_000024.5:n.284T>N", + "ADRB2" + )] + // ref repl + #[case( + "NM_000024.5:c.46A>N", + "NC_000005.9:g.148206440G>N", + "NM_000024.5:n.285A>N", + "ADRB2" + )] + #[case( + "NM_000024.5:c.47G>N", + "NC_000005.9:g.148206441G>N", + "NM_000024.5:n.286G>N", + "ADRB2" + )] + #[case( + "NM_000024.5:c.48A>N", + "NC_000005.9:g.148206442A>N", + "NM_000024.5:n.287A>N", + "ADRB2" + )] + #[case( + "NM_000024.5:c.49A>N", + "NC_000005.9:g.148206443A>N", + "NM_000024.5:n.288A>N", + "ADRB2" + )] + #[case( + "NM_000024.5:c.50G>N", + "NC_000005.9:g.148206444G>N", + "NM_000024.5:n.289G>N", + "ADRB2" + )] + #[case( + "NM_000024.5:c.51C>N", + "NC_000005.9:g.148206445C>N", + "NM_000024.5:n.281C>N", + "ADRB2" + )] + // ins, del, delins, dup + #[case( + "NM_000024.5:c.46_47insNN", + "NC_000005.9:g.148206440_148206441insNN", + "NM_000024.5:n.285_286insNN", + "ADRB2" + )] + #[case( + "NM_000024.5:c.45_47delTAG", + "NC_000005.9:g.148206439_148206441delTGG", + "NM_000024.5:n.284_286delTAG", + "ADRB2" + )] + #[case( + "NM_000024.5:c.45_47delTAGinsNNNN", + "NC_000005.9:g.148206439_148206441delTGGinsNNNN", + "NM_000024.5:n.284_286delTAGinsNNNN", + "ADRB2" + )] + #[case( + "NM_000024.5:c.45_47delTAGinsNNNN", + "NC_000005.9:g.148206439_148206441delTGGinsNNNN", + "NM_000024.5:n.284_286delTAGinsNNNN", + "ADRB2" + )] + #[case( + "NM_000024.5:c.46dupA", + "NC_000005.9:g.148206440dupG", + "NM_000024.5:n.285dupA", + "ADRB2" + )] + // IFNA16 │ NM_002173.2 │ 6 │ 576 │ NC_000009.11 │ -1 │ 21216371, 21217310 | 691=2X246= + // cseq = hdp.fetch_seq("NM_002173.2") + // gseq = reverse_complement(hdp.fetch_seq("NC_000009.11",21216371,21217310)) + // cseq[685:695] = "AAATTTCAAA" + // gseq[685:695] = "AAATTTTCAA" + // ^^ @ n.692_693 + // These variants are in and around the 2X substitution + #[case( + "NM_002173.2:c.*110A>N", + "NC_000009.11:g.21216625T>N", + "NM_002173.2:n.686A>N", + "IFNA16" + )] + #[case( + "NM_002173.2:c.*111A>N", + "NC_000009.11:g.21216624T>N", + "NM_002173.2:n.687A>N", + "IFNA16" + )] + #[case( + "NM_002173.2:c.*112A>N", + "NC_000009.11:g.21216623T>N", + "NM_002173.2:n.688A>N", + "IFNA16" + )] + #[case( + "NM_002173.2:c.*113T>N", + "NC_000009.11:g.21216622A>N", + "NM_002173.2:n.689T>N", + "IFNA16" + )] + #[case( + "NM_002173.2:c.*114T>N", + "NC_000009.11:g.21216621A>N", + "NM_002173.2:n.690T>N", + "IFNA16" + )] + #[case( + "NM_002173.2:c.*115T>N", + "NC_000009.11:g.21216620A>N", + "NM_002173.2:n.691T>N", + "IFNA16" + )] + // ref repl + #[case( + "NM_002173.2:c.*116C>N", + "NC_000009.11:g.21216619A>N", + "NM_002173.2:n.692C>N", + "IFNA16" + )] + // ref repl + #[case( + "NM_002173.2:c.*117A>N", + "NC_000009.11:g.21216618G>N", + "NM_002173.2:n.693A>N", + "IFNA16" + )] + #[case( + "NM_002173.2:c.*118A>N", + "NC_000009.11:g.21216617T>N", + "NM_002173.2:n.694A>N", + "IFNA16" + )] + #[case( + "NM_002173.2:c.*119A>N", + "NC_000009.11:g.21216616T>N", + "NM_002173.2:n.695A>N", + "IFNA16" + )] + // ins, del, delins, dup: + #[case( + "NM_002173.2:c.*115_*117insNN", + "NC_000009.11:g.21216618_21216620insNN", + "NM_002173.2:n.691_693insNN", + "IFNA16" + )] + #[case( + "NM_002173.2:c.*114_*117delTTCA", + "NC_000009.11:g.21216618_21216621delGAAA", + "NM_002173.2:n.690_693delTTCA", + "IFNA16" + )] + #[case( + "NM_002173.2:c.*115_*117delTCAinsNN", + "NC_000009.11:g.21216618_21216620delGAAinsNN", + "NM_002173.2:n.691_693delTCAinsNN", + "IFNA16" + )] + #[case( + "NM_002173.2:c.*115_*117delTCAinsNN", + "NC_000009.11:g.21216618_21216620delGAAinsNN", + "NM_002173.2:n.691_693delTCAinsNN", + "IFNA16" + )] + #[case( + "NM_002173.2:c.*115_*117dupTCA", + "NC_000009.11:g.21216618_21216620dupGAA", + "NM_002173.2:n.691_693dupTCA", + "IFNA16" + )] + // genomic variation within an indel discrepancy + // NM_032790.3 c 108 > + // NM_032790.3 n 301 > CGGGGAGCCCCCGGGGGCC------CCGCCACCGCCGCCGT + // |||||||||||||||||||------|||||||||||||||| + // NC_000012.11 g 122064755 > CGGGGAGCCCCCGGGGGCCCCGCCACCGCCACCGCCGCCGT + // ********** + #[case( + "NM_032790.3:c.125_128delCCCC", + "NC_000012.11:g.122064772_122064781delCCCCGCCACC", + "NM_032790.3:n.318_321delCCCC", + "ORAI1" + )] + fn cases( + #[case] hgvs_c: &str, + #[case] hgvs_g: &str, + #[case] hgvs_n: &str, + #[case] gene: &str, + ) -> Result<(), anyhow::Error> { + let mapper = build_mapper_38(true)?; + let var_c = HgvsVariant::from_str(hgvs_c)?; + let var_g = HgvsVariant::from_str(hgvs_g)?; + let var_n = HgvsVariant::from_str(hgvs_n)?; + + let var_c = var_c.with_reference("NNNNNN".to_string()); + let fixed_c = mapper.replace_reference(var_c)?; + assert_eq!(format!("{}", &fixed_c), hgvs_c, "gene={}", gene); + + let var_g = var_g.with_reference("NNNNNN".to_string()); + let fixed_g = mapper.replace_reference(var_g)?; + assert_eq!(format!("{}", &fixed_g), hgvs_g, "gene={}", gene); + + let var_n = var_n.with_reference("NNNNNN".to_string()); + let fixed_n = mapper.replace_reference(var_n)?; + assert_eq!(format!("{}", &fixed_n), hgvs_n, "gene={}", gene); + + Ok(()) + } + } + + /// The following is a port of `Test_AssemblyMapper` in + /// `test_hgvs_variantmapper_near_discrepancies.py` (sic!) + mod projections { + use std::str::FromStr; + + use crate::parser::HgvsVariant; + + use super::build_mapper_37; + + fn test_projections( + hgvs_g: &str, + hgvs_c: &str, + hgvs_n: &str, + hgvs_p: &str, + ) -> Result<(), anyhow::Error> { + let mapper = build_mapper_37(false)?; + + let var_g = HgvsVariant::from_str(hgvs_g)?; + let var_c = HgvsVariant::from_str(hgvs_c)?; + let var_n = HgvsVariant::from_str(hgvs_n)?; + + let res_cg = mapper.c_to_g(&var_c)?; + assert_eq!(format!("{}", res_cg), hgvs_g,); + + let res_gc = mapper.g_to_c(&var_g, &var_c.accession())?; + assert_eq!(format!("{}", res_gc), hgvs_c,); + + let res_ng = mapper.n_to_g(&var_n)?; + assert_eq!(format!("{}", res_ng), hgvs_g,); + + let res_gn = mapper.g_to_n(&var_g, &var_n.accession())?; + assert_eq!(format!("{}", res_gn), hgvs_n,); + + let res_cn = mapper.c_to_n(&var_c)?; + assert_eq!(format!("{}", res_cn), hgvs_n,); + + let res_nc = mapper.n_to_c(&var_n)?; + assert_eq!(format!("{}", res_nc), hgvs_c,); + + let res_cp = mapper.c_to_p(&var_c)?; + assert_eq!(format!("{}", res_cp), hgvs_p,); + + Ok(()) + } + + #[test] + fn snv() -> Result<(), anyhow::Error> { + test_projections( + "NC_000007.13:g.36561662C>T", + "NM_001637.3:c.1582G>A", + "NM_001637.3:n.1983G>A", + "NP_001628.1:p.Gly528Arg", + ) + } + + #[test] + fn intronic() -> Result<(), anyhow::Error> { + test_projections( + "NC_000010.10:g.89711873A>C", + "NM_000314.4:c.493-2A>C", + "NM_000314.4:n.1524-2A>C", + "NP_000305.3:p.?", + ) + } + } + + /// The following is a port of the `test_hgvs_variantmapper_near_discrepancies.py` (sic!) + mod near_discrepancies { + use std::str::FromStr; + + use crate::parser::{HgvsVariant, NoRef}; + + use super::build_mapper_38; + + #[test] + fn run() -> Result<(), anyhow::Error> { + let mapper = build_mapper_38(false)?; + let mut rdr = csv::ReaderBuilder::new() + .delimiter(b'\t') + .has_headers(false) + .comment(Some(b'#')) + .from_path("tests/data/mapper/proj-near-disc.tsv")?; + + for row in rdr.records() { + let row = row?; + let disc_type = row.get(0).unwrap(); + let loc_type = row.get(1).unwrap(); + let variant = row.get(2).unwrap(); + let expected = row.get(3).unwrap(); + + let var_n = HgvsVariant::from_str(variant)?; + let var_g = mapper.n_to_g(&var_n)?; + + let actual = format!("{}", &NoRef(&var_g)).replace(['(', ')'], ""); + + assert_eq!( + actual, expected, + "loc_type={loc_type} disc_type={disc_type}", + ) + } + + Ok(()) + } + } +} diff --git a/src/mapper/mod.rs b/src/mapper/mod.rs index bf95be7..51590e0 100644 --- a/src/mapper/mod.rs +++ b/src/mapper/mod.rs @@ -1,4 +1,5 @@ pub mod alignment; pub(crate) mod altseq; +pub mod assembly; pub mod cigar; pub mod variant; diff --git a/src/mapper/variant.rs b/src/mapper/variant.rs index 4681866..7fc9172 100644 --- a/src/mapper/variant.rs +++ b/src/mapper/variant.rs @@ -124,6 +124,16 @@ impl Mapper { ) } + /// Construct a new normalizer for the variant mapper. + pub fn normalizer(&self) -> Result<Normalizer, anyhow::Error> { + Ok(Normalizer::new( + self, + self.provider.clone(), + self.validator.clone(), + Default::default(), + )) + } + /// Convert from genome (g.) variant to transcript variant (g. or n.). /// /// # Args @@ -175,13 +185,7 @@ impl Mapper { && !mapper.is_g_interval_in_bounds(loc_edit.loc.inner()) { info!("Renormalizing out-of-bounds minus strand variant on genomic sequenc"); - Normalizer::new( - self, - self.provider.clone(), - self.validator.clone(), - Default::default(), - ) - .normalize(&var_g)? + self.normalizer()?.normalize(&var_g)? } else { var_g.clone() }; @@ -193,7 +197,7 @@ impl Mapper { ); let (pos_n, edit_n) = if let Mu::Certain(pos_n) = pos_n { let edit_n = self.convert_edit_check_strand(mapper.strand, &loc_edit.edit)?; - if let NaEdit::Ins { .. } = edit_n.inner() { + if let NaEdit::Ins { alternative } = edit_n.inner() { if pos_n.start.offset.is_none() && pos_n.end.offset.is_none() && pos_n.end.base - pos_n.start.base > 1 @@ -209,8 +213,9 @@ impl Mapper { ..pos_n.end }, }), - Mu::Certain(NaEdit::Ins { - alternative: "".to_string(), + Mu::Certain(NaEdit::RefAlt { + reference: "".to_string(), + alternative: alternative.clone(), }), ) } else { @@ -229,7 +234,7 @@ impl Mapper { &var_g, )?, }; - (Mu::Uncertain((*pos_n.inner()).clone()), Mu::Certain(edit_n)) + (Mu::Certain((*pos_n.inner()).clone()), Mu::Certain(edit_n)) }; // the following is not needed? @@ -288,7 +293,7 @@ impl Mapper { let (pos_g, edit_g) = if let Mu::Certain(pos_g) = pos_g { let edit_g = self.convert_edit_check_strand(mapper.strand, &loc_edit.edit)?; - if let (NaEdit::Ins { .. }, Some(end), Some(start)) = + if let (NaEdit::Ins { alternative }, Some(end), Some(start)) = (edit_g.inner(), pos_g.end, pos_g.start) { if end - start > 1 { @@ -298,8 +303,9 @@ impl Mapper { end: Some(end - 1), }), Mu::from( - NaEdit::Ins { - alternative: "".to_string(), + NaEdit::RefAlt { + reference: "".to_string(), + alternative: alternative.to_owned(), }, edit_g.is_certain(), ), @@ -379,7 +385,7 @@ impl Mapper { let (pos_c, edit_c) = if let Mu::Certain(pos_c) = pos_c { let edit_c = self.convert_edit_check_strand(mapper.strand, &loc_edit.edit)?; - if let NaEdit::Ins { .. } = edit_c.inner() { + if let NaEdit::Ins { alternative } = edit_c.inner() { if pos_c.start.offset.is_none() && pos_c.end.offset.is_none() && pos_c.end.base - pos_c.start.base > 1 @@ -395,8 +401,9 @@ impl Mapper { ..pos_c.end }, }), - Mu::Certain(NaEdit::Ins { - alternative: "".to_string(), + Mu::Certain(NaEdit::RefAlt { + reference: "".to_string(), + alternative: alternative.clone(), }), ) } else { @@ -415,7 +422,7 @@ impl Mapper { &var_g, )?, }; - (Mu::Uncertain((*pos_c.inner()).clone()), Mu::Certain(edit_c)) + (Mu::Certain((*pos_c.inner()).clone()), Mu::Certain(edit_c)) }; let var_c = HgvsVariant::CdsVariant { @@ -468,7 +475,7 @@ impl Mapper { let (pos_g, edit_g) = if let Mu::Certain(pos_g) = pos_g { let edit_g = self.convert_edit_check_strand(mapper.strand, &loc_edit.edit)?; - if let (NaEdit::Ins { .. }, Some(end), Some(start)) = + if let (NaEdit::Ins { alternative }, Some(end), Some(start)) = (edit_g.inner(), pos_g.end, pos_g.start) { if end - start > 1 { @@ -478,8 +485,9 @@ impl Mapper { end: Some(end - 1), }), Mu::from( - NaEdit::Ins { - alternative: "".to_string(), + NaEdit::RefAlt { + reference: "".to_string(), + alternative: alternative.clone(), }, edit_g.is_certain(), ), @@ -493,12 +501,20 @@ impl Mapper { } else { // variant at alignment gap let pos_n = mapper.g_to_n(pos_g.inner())?; + let var_n = HgvsVariant::TxVariant { + accession: var_c.accession().clone(), + gene_symbol: var_c.gene_symbol().clone(), + loc_edit: TxLocEdit { + loc: pos_n.clone(), + edit: Mu::Certain(var_c.na_edit().unwrap().clone()), + }, + }; let edit_n = NaEdit::RefAlt { reference: "".to_string(), alternative: self.get_altered_sequence( mapper.strand, pos_n.inner().clone().into(), - &var_c, + &var_n, )?, }; (pos_g, Mu::Certain(edit_n)) @@ -738,7 +754,9 @@ impl Mapper { seq.replace_range(r, alternative) } NaEdit::DelRef { .. } | NaEdit::DelNum { .. } => seq.replace_range(r, ""), - NaEdit::Ins { alternative } => seq.replace_range(r, alternative), + NaEdit::Ins { alternative } => { + seq.replace_range((r.start + 1)..(r.start + 1), alternative) + } NaEdit::Dup { .. } => { let seg = seq[r.clone()].to_string(); seq.replace_range(r.end..r.end, &seg); @@ -793,7 +811,7 @@ impl Mapper { } /// Fetch reference sequence for variant and return updated `HgvsVariant` if necessary. - pub(crate) fn replace_reference(&self, var: HgvsVariant) -> Result<HgvsVariant, anyhow::Error> { + pub fn replace_reference(&self, var: HgvsVariant) -> Result<HgvsVariant, anyhow::Error> { match &var { HgvsVariant::ProtVariant { .. } => Err(anyhow::anyhow!( "Can only update reference for c, g, m, n, r" @@ -1638,7 +1656,6 @@ mod test { let records = gcp_tests::load_records(&path)?; for record in records { - println!("-- {}", &record.id); let var_c = HgvsVariant::from_str(&record.hgvs_c)?; let prot_ac = record .hgvs_p diff --git a/src/parser/display.rs b/src/parser/display.rs index 4f3cb20..b1117fb 100644 --- a/src/parser/display.rs +++ b/src/parser/display.rs @@ -63,7 +63,13 @@ impl Display for NaEdit { alternative, } => match (reference.len(), alternative.len()) { (0, 0) => write!(f, "="), - (1, 1) => write!(f, "{reference}>{alternative}"), + (1, 1) => { + if reference == alternative { + write!(f, "=") + } else { + write!(f, "{reference}>{alternative}") + } + } (0, _) => write!(f, "delins{alternative}"), (_, 0) => write!(f, "del{reference}ins"), (_, _) => { @@ -98,7 +104,13 @@ impl<'a> Display for NoRef<'a, NaEdit> { alternative, }) => match (reference.len(), alternative.len()) { (0, 0) => write!(f, "="), - (1, 1) => write!(f, "{reference}>{alternative}"), + (1, 1) => { + if reference == alternative { + write!(f, "=") + } else { + write!(f, "{reference}>{alternative}") + } + } (_, 0) => write!(f, "delins"), (_, _) => { if reference == alternative { diff --git a/src/parser/ds.rs b/src/parser/ds.rs index 4d4b022..80bbb29 100644 --- a/src/parser/ds.rs +++ b/src/parser/ds.rs @@ -159,8 +159,9 @@ impl NaEdit { } => { if old_reference.is_empty() && alternative.is_empty() { NaEdit::RefAlt { - alternative: reference.clone(), - reference, + // sic! + alternative: reference.clone(), // sic! + reference, // sic! } } else { NaEdit::RefAlt { @@ -433,6 +434,18 @@ impl HgvsVariant { } } + /// Return the gene symbol. + pub fn gene_symbol(&self) -> &Option<GeneSymbol> { + match self { + HgvsVariant::CdsVariant { gene_symbol, .. } => gene_symbol, + HgvsVariant::GenomeVariant { gene_symbol, .. } => gene_symbol, + HgvsVariant::MtVariant { gene_symbol, .. } => gene_symbol, + HgvsVariant::TxVariant { gene_symbol, .. } => gene_symbol, + HgvsVariant::ProtVariant { gene_symbol, .. } => gene_symbol, + HgvsVariant::RnaVariant { gene_symbol, .. } => gene_symbol, + } + } + /// Return the accession. pub fn accession(&self) -> &Accession { match self { diff --git a/src/static_data/mod.rs b/src/static_data/mod.rs index e6a5872..7ea5348 100644 --- a/src/static_data/mod.rs +++ b/src/static_data/mod.rs @@ -10,7 +10,7 @@ const GRCH37_JSON_GZ: &[u8] = include_bytes!("_data/GRCh37.json.gz"); const GRCH37_P10_JSON_GZ: &[u8] = include_bytes!("_data/GRCh37.p10.json.gz"); const GRCH38_JSON_GZ: &[u8] = include_bytes!("_data/GRCh38.json.gz"); -#[derive(Debug, Deserialize, Enum)] +#[derive(Debug, Deserialize, Enum, Clone, Copy)] pub enum Assembly { Grch37, Grch37p10, diff --git a/tests/data/data/bootstrap.sh b/tests/data/data/bootstrap.sh index 14be387..ffbab02 100644 --- a/tests/data/data/bootstrap.sh +++ b/tests/data/data/bootstrap.sh @@ -65,10 +65,12 @@ read -r -d '' GENES <<EOF AADACL3 ADGRL3 ADRA2B +ADRB2 AGBL5 ALG9 AOAH ASB18 +ALMS1 ATM BAHCC1 BOLA3 @@ -95,7 +97,9 @@ GTF3C2 HIST3H2A H2AW HELQ +HEXB HMGA1 +IFNA16 IRAK3 JRK KCNIP4 @@ -110,18 +114,22 @@ MED21 MLH1 MSH6 MYH7 +NR1H2 NEFL OMA1 OPA1 OR9A4 ORAI1 +PLOD1 POMC PTEN PTH2 RABGAP1L +RBM38 RET RGL3 RMRP +RPGR RPL17 RPL38 RPL41 @@ -129,14 +137,19 @@ RYR1 RYR2 SCN5A SDHC +SDR5A2 SELENON SELL SERPINC1 SIL1 SLC22A5 +SLC34A3 +SPG11 SRA1 SRD5A2 +SSPO SSTR3 +TAZ TERC TPM3 TSC2 diff --git a/tests/data/data/uta_20210129-subset.pgd.gz b/tests/data/data/uta_20210129-subset.pgd.gz index 5d24672..bb5b891 100644 --- a/tests/data/data/uta_20210129-subset.pgd.gz +++ b/tests/data/data/uta_20210129-subset.pgd.gz @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2df85a5a42025c15ef2d370afb50ca9c0ce92f61c08696a8ab9a19ea93742ca6 -size 1738005 +oid sha256:280b647608377b041e4bf084159e3903a94d957704c1e9b0381cd3ad58127595 +size 1945350 diff --git a/tests/data/mapper/proj-near-disc.tsv b/tests/data/mapper/proj-near-disc.tsv new file mode 100644 index 0000000..098cfa4 --- /dev/null +++ b/tests/data/mapper/proj-near-disc.tsv @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8cf77bc78753fb7f905df12be763a5a3c8293a75242030e52b2110cd28b3e340 +size 3330 diff --git a/tests/data/mapper/sanity_cp.tsv b/tests/data/mapper/sanity_cp.tsv index 81fc9de..7c29dd6 100644 --- a/tests/data/mapper/sanity_cp.tsv +++ b/tests/data/mapper/sanity_cp.tsv @@ -1,10 +1,3 @@ -accession transcript_sequence cds_start_i cds_end_i -NM_999999.1 AAAATCAAAATGAAAGCGAAAGCGTTTCGCGCGAAATAGGGG 9 39 -NM_999998.1 AAAATCAAAATGAAAGCGAAAGCGTTTCGCGCGAAATAGAGGAGGTAGTTTCGC 9 39 -NM_999997.1 AAAATCAAAATGAAAGCGAAAGCGTTTCGCGTAGAATAGAGGAGGCAGTTTCGC 9 39 -NM_999996.1 AAAATCAAAATGAAATCGAAAGCGTTTCGCGCGAAATAGAGGAGGCAGTTTCGC 9 39 -NM_999995.1 AAAATCAAAATGAAAAAATCGAAAGCGTTTCGCGCGAAATAGAGGAGGCAGTTTCGC 9 42 -NM_999994.1 AAAATCAAAATGAAAAAAAAATCGAAAGCGTTTCGCGCGAAATAGAGGAGGCAGTTTCGC 9 45 -NM_999993.1 AAAATCAAAATGGGGAGGGCCCGGCAGCCAGCTTTATAGAGGAGGCAGTTTCGC 9 39 -NM_999992.1 AAAATCAAAATGGGGTAGGCCCGGCAGCCAGCTTTATAGAGGAGGCAGTTTCGC 9 39 -NM_999992.2 AAAATCAAAATGGGGTAGGCCCGGCAGCCAGCTTTATAGAGGAGGCAGTTTCGCC 9 40 +version https://git-lfs.github.com/spec/v1 +oid sha256:ea13fbdb42d2fa7736081fcd7de378425de31f02cdb92b749512b288a8d89760 +size 698 diff --git a/tests/data/seqrepo_cache.fasta b/tests/data/seqrepo_cache.fasta index 41a00e6..128c3cd 100644 --- a/tests/data/seqrepo_cache.fasta +++ b/tests/data/seqrepo_cache.fasta @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5fb54507ea15a1e9c921982f82341a38603564dd340e0040dd2a3c38490d4552 -size 594704 +oid sha256:327c981c959dfb41c3b63fe43aaa1469da563794da412a739d54bceec7a832c9 +size 643034