From c62092e8e35c414fe9858454bc8a91e026077e7f Mon Sep 17 00:00:00 2001 From: Till Hartmann Date: Fri, 4 Oct 2024 09:54:25 +0200 Subject: [PATCH 1/2] fix: expose shuffle_direction and window_size config options --- src/mapper/assembly.rs | 7 +++++++ src/mapper/variant.rs | 8 ++++++++ src/normalizer.rs | 5 +++-- src/validator/mod.rs | 2 ++ 4 files changed, 20 insertions(+), 2 deletions(-) diff --git a/src/mapper/assembly.rs b/src/mapper/assembly.rs index b1de930..c043de5 100644 --- a/src/mapper/assembly.rs +++ b/src/mapper/assembly.rs @@ -7,6 +7,7 @@ use std::sync::Arc; use crate::mapper::error::Error; use crate::mapper::variant; +use crate::normalizer::Direction; use crate::parser::HgvsVariant; use crate::{data::interface::Provider, validator::ValidationLevel}; use biocommons_bioutils::assemblies::Assembly; @@ -47,6 +48,8 @@ pub struct Config { /// Use the genome sequence in case of uncertain g-to-n projections. This /// can be switched off so genome sequence does not have to be available. pub genome_seq_available: bool, + pub shuffle_direction: Direction, + pub window_size: usize, } impl Default for Config { @@ -63,6 +66,8 @@ impl Default for Config { add_gene_symbol: false, renormalize_g: true, genome_seq_available: true, + shuffle_direction: Default::default(), + window_size: 20, } } } @@ -111,6 +116,8 @@ impl Mapper { strict_bounds: config.strict_bounds, renormalize_g: config.renormalize_g, genome_seq_available: config.genome_seq_available, + shuffle_direction: config.shuffle_direction, + window_size: config.window_size, }; let inner = variant::Mapper::new(&inner_config, provider.clone()); let asm_accessions = provider diff --git a/src/mapper/variant.rs b/src/mapper/variant.rs index a01ba6e..ac5cd4c 100644 --- a/src/mapper/variant.rs +++ b/src/mapper/variant.rs @@ -7,6 +7,8 @@ use cached::proc_macro::cached; use cached::SizedCache; use log::{debug, info}; +use super::alignment; +use crate::normalizer::Direction; use crate::{ data::interface::Provider, mapper::Error, @@ -37,6 +39,8 @@ pub struct Config { /// Use the genome sequence in case of uncertain g-to-n projections. This /// can be switched off so genome sequence does not have to be available. pub genome_seq_available: bool, + pub shuffle_direction: Direction, + pub window_size: usize, } impl Default for Config { @@ -49,6 +53,8 @@ impl Default for Config { strict_bounds: true, renormalize_g: true, genome_seq_available: true, + shuffle_direction: Default::default(), + window_size: 20, } } } @@ -151,6 +157,8 @@ impl Mapper { self.validator.clone(), normalizer::Config { replace_reference: self.config.replace_reference, + shuffle_direction: self.config.shuffle_direction, + window_size: self.config.window_size, ..Default::default() }, )) diff --git a/src/normalizer.rs b/src/normalizer.rs index 04d13a5..43c9e3c 100644 --- a/src/normalizer.rs +++ b/src/normalizer.rs @@ -52,8 +52,9 @@ mod error { } /// A direction with respect to a sequence. -#[derive(Debug, PartialEq, Eq, Clone, Copy)] +#[derive(Debug, PartialEq, Eq, Clone, Copy, Default)] pub enum Direction { + #[default] ThreeToFive, FiveToThree, } @@ -77,7 +78,7 @@ impl Default for Config { Self { alt_aln_method: "splign".to_string(), cross_boundaries: false, - shuffle_direction: Direction::FiveToThree, + shuffle_direction: Default::default(), replace_reference: true, validate: true, window_size: 20, diff --git a/src/validator/mod.rs b/src/validator/mod.rs index 81e6a77..c73cf36 100644 --- a/src/validator/mod.rs +++ b/src/validator/mod.rs @@ -134,6 +134,8 @@ impl ExtrinsicValidator { strict_bounds: true, renormalize_g: false, genome_seq_available: true, + shuffle_direction: Default::default(), + window_size: 20, }; Self { strict, From ad91baf250a8d1438f56b1d632e1a4eeb7318afc Mon Sep 17 00:00:00 2001 From: Till Hartmann Date: Fri, 4 Oct 2024 09:54:56 +0200 Subject: [PATCH 2/2] fix: introduce clamping to mimick hgvs python behaviour --- src/mapper/altseq.rs | 15 +++++++++++++-- src/mapper/assembly.rs | 2 +- src/mapper/variant.rs | 21 +++++++++++++++++---- 3 files changed, 31 insertions(+), 7 deletions(-) diff --git a/src/mapper/altseq.rs b/src/mapper/altseq.rs index 7cd6caa..ee2db73 100644 --- a/src/mapper/altseq.rs +++ b/src/mapper/altseq.rs @@ -437,17 +437,28 @@ impl AltSeqBuilder { // Incorporate the variant into the sequence (depending on the type). let mut is_substitution = false; + let range = if end >= seq.len() && reference.is_some() { + log::warn!( + "Altered sequence range {:?} is incompatible with sequence length {:?}, clamping. Variant description is {}", + start..end, + seq.len(), + &self.var_c + ); + start..seq.len() + } else { + start..end + }; match (reference, alternative) { (Some(reference), Some(alternative)) => { // delins or SNP - seq.replace_range(start..end, &alternative); + seq.replace_range(range, &alternative); if reference.len() == 1 && alternative.len() == 1 { is_substitution = true; } } (Some(_reference), None) => { // deletion - seq.replace_range(start..end, ""); + seq.replace_range(range, ""); } (None, Some(alternative)) => { // insertion diff --git a/src/mapper/assembly.rs b/src/mapper/assembly.rs index c043de5..4887fc6 100644 --- a/src/mapper/assembly.rs +++ b/src/mapper/assembly.rs @@ -288,7 +288,7 @@ impl Mapper { /// Normalize variant if requested and ignore errors. This is better than checking whether /// the variant is intronic because future UTAs will support LRG, which will enable checking /// intronic variants. - fn maybe_normalize(&self, var: &HgvsVariant) -> Result { + pub fn maybe_normalize(&self, var: &HgvsVariant) -> Result { if self.config.normalize { let normalizer = self.inner.normalizer()?; normalizer.normalize(var).or_else(|_| { diff --git a/src/mapper/variant.rs b/src/mapper/variant.rs index ac5cd4c..49ada0b 100644 --- a/src/mapper/variant.rs +++ b/src/mapper/variant.rs @@ -21,8 +21,6 @@ use crate::{ validator::{ValidationLevel, Validator}, }; -use super::alignment; - /// Configuration for Mapper. /// /// Defaults are taken from `hgvs` Python library. @@ -263,7 +261,7 @@ impl Mapper { (Mu::Certain((*pos_n).clone()), edit_n) } } else { - // This is the how the original code handles uncertain positions. We will reach + // This is how the original code handles uncertain positions. We will reach // here if the position is uncertain and we have the genome sequence. let pos_g = mapper.n_to_g(pos_n)?; let edit_n = NaEdit::RefAlt { @@ -794,6 +792,17 @@ impl Mapper { .loc_range() .ok_or(Error::NoAlteredSequenceForMissingPositions)?; let r = ((r.start - interval.start) as usize)..((r.end - interval.start) as usize); + let r = if r.end >= seq.len() { + log::warn!( + "Altered sequence range {:?} is incompatible with sequence length {:?}, clamping. Variant description is {}", + r, + seq.len(), + &var + ); + r.start..seq.len() + } else { + r + }; let na_edit = var.na_edit().ok_or(Error::NaEditMissing)?; @@ -801,7 +810,11 @@ impl Mapper { NaEdit::RefAlt { alternative, .. } | NaEdit::NumAlt { alternative, .. } => { seq.replace_range(r, alternative) } - NaEdit::DelRef { .. } | NaEdit::DelNum { .. } => seq.replace_range(r, ""), + NaEdit::DelRef { .. } | NaEdit::DelNum { .. } => { + // FIXME the original code in python simply does `del seq[pos_start:pos_end]`, + // which does not error if `pos_end > len(seq)`. Check if this is intended or not. + seq.replace_range(r, "") + } NaEdit::Ins { alternative } => { seq.replace_range((r.start + 1)..(r.start + 1), alternative) }