Skip to content

Commit

Permalink
feat: implement parsing of HGVS variants (#2) (#3)
Browse files Browse the repository at this point in the history
  • Loading branch information
holtgrewe authored Feb 15, 2023
1 parent 9f09719 commit dbcfd05
Show file tree
Hide file tree
Showing 10 changed files with 4,114 additions and 0 deletions.
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,2 +1,5 @@
/target
/Cargo.lock

*~
.*.sw?
3 changes: 3 additions & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -4,3 +4,6 @@ version = "0.1.0"
edition = "2021"

[dependencies]
anyhow = "1.0.69"
nom = "7.1.3"
pretty_assertions = "1.3.0"
3 changes: 3 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,6 @@
[![CI](https://github.com/bihealth/hgvs-rs/actions/workflows/rust.yml/badge.svg)](https://github.com/bihealth/hgvs-rs/actions/workflows/rust.yml)
[![codecov](https://codecov.io/gh/bihealth/hgvs-rs/branch/main/graph/badge.svg?token=aZchhLWdzt)](https://codecov.io/gh/bihealth/hgvs-rs)

# hgvs-rs

This is a port of [biocommons/hgvs](https://github.com/biocommons/hgvs) to the Rust programming language.
2 changes: 2 additions & 0 deletions src/lib.rs
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
pub mod parser;

/// Returns the sum of `left` and `right`.
pub fn add(left: usize, right: usize) -> usize {
    left + right
}
Expand Down
291 changes: 291 additions & 0 deletions src/parser/ds.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,291 @@
//! Data structures for representing HGVS variant descriptions.
/// Wrapper expressing that a value is "maybe uncertain".
///
/// Both variants carry the same payload type `T`; only the variant records
/// whether the value is considered certain or uncertain.
///
/// `Eq` and `Hash` are derived in addition to `PartialEq` so that values can
/// be stored in hash-based collections whenever `T` supports it.
#[derive(Clone, Debug, PartialEq, Eq, Hash)]
pub enum Mu<T> {
    /// Certain variant of `T`.
    Certain(T),
    /// Uncertain variant of `T`.
    Uncertain(T),
}

/// Representation of gene symbol, e.g., `TTN` or `Ttn`.
///
/// `Eq` and `Hash` are derived so that gene symbols can be used as keys in
/// hash-based collections.
#[derive(Clone, Debug, PartialEq, Eq, Hash)]
pub struct GeneSymbol {
    /// The gene symbol text.
    pub value: String,
}

/// Edit of nucleic acids.
///
/// `Eq` and `Hash` are derived so that edits can be used as keys in
/// hash-based collections.
#[derive(Clone, Debug, PartialEq, Eq, Hash)]
pub enum NaEdit {
    /// A substitution where both reference and alternative allele are nucleic acid strings
    /// (or empty).
    RefAlt {
        /// Reference allele.
        reference: String,
        /// Alternative allele.
        alternative: String,
    },
    /// A substitution where the reference is given as a number (`count`) and the
    /// alternative as a string.
    NumAlt {
        /// Number standing in for the reference.
        count: i32,
        /// Alternative allele.
        alternative: String,
    },
    /// Deletion of one or more nucleic acid characters.
    Del {
        /// Deleted reference sequence.
        reference: String,
    },
    /// Insertion of one or more nucleic acid characters.
    Ins {
        /// Inserted sequence.
        alternative: String,
    },
    /// Duplication of nucleic acid reference sequence.
    Dup {
        /// Duplicated reference sequence.
        reference: String,
    },
    /// Inversion of a (potentially empty) nucleic acid reference sequence.
    InvRef {
        /// Inverted reference sequence.
        reference: String,
    },
    /// Inversion of a stretch given by its length.
    InvNum {
        /// Length of the inverted stretch.
        count: i32,
    },
}

/// Uncertain change through extension.
///
/// `Eq` and `Hash` are derived so that values can be used as keys in
/// hash-based collections.
#[derive(Clone, Debug, PartialEq, Eq, Hash)]
pub enum UncertainLengthChange {
    /// No length change is given.
    None,
    /// A length change whose value is unknown.
    // NOTE(review): presumably written as `?` in the HGVS string; confirm against the parser.
    Unknown,
    /// A length change with the given value.
    Known(i32),
}

/// Representation of accession, e.g., `NM_01234.5`.
///
/// `Eq` and `Hash` are derived so that accessions can be used as keys in
/// hash-based collections.
#[derive(Clone, Debug, PartialEq, Eq, Hash)]
pub struct Accession {
    /// The accession text.
    pub value: String,
}

/// Edit applied to a protein sequence.
///
/// The variants correspond to the edit kinds that can follow a protein
/// location; the location itself is stored elsewhere.
#[derive(Clone, Debug, PartialEq)]
pub enum ProteinEdit {
    /// `fs`
    // NOTE(review): presumably the HGVS frameshift notation; confirm against the parser.
    Fs {
        /// Optional alternative amino acid.
        alternative: Option<String>,
        /// Optional terminal amino acid.
        terminal: Option<String>,
        /// Change in length, possibly uncertain.
        length: UncertainLengthChange,
    },
    /// `ext`
    Ext {
        /// Amino acid before "ext"
        aa_ext: Option<String>,
        /// Amino acid after "ext", terminal if shift is positive.
        ext_aa: Option<String>,
        /// Change in protein length.
        change: UncertainLengthChange,
    },
    /// Substitution by `alternative`.
    Subst {
        /// Replacement value.
        alternative: String,
    },
    /// `delins`
    DelIns {
        /// Inserted value.
        alternative: String,
    },
    /// `ins`
    Ins {
        /// Inserted value.
        alternative: String,
    },
    /// `del`
    Del,
    /// `dup`
    Dup,
    /// `=`
    Ident,
}

/// A HGVS variant specification.
///
/// Every variant carries the accession, an optional gene symbol, and a
/// location-with-edit value whose type depends on the coordinate prefix.
#[derive(Clone, Debug, PartialEq)]
pub enum HgvsVariant {
    /// Variant specification with `c.` location.
    CdsVariant {
        /// Accession of the reference sequence.
        accession: Accession,
        /// Gene symbol, if given.
        gene_symbol: Option<GeneSymbol>,
        /// CDS location and edit.
        loc_edit: CdsLocEdit,
    },
    /// Variant specification with `g.` location.
    GenomeVariant {
        /// Accession of the reference sequence.
        accession: Accession,
        /// Gene symbol, if given.
        gene_symbol: Option<GeneSymbol>,
        /// Genome location and edit.
        loc_edit: GenomeLocEdit,
    },
    /// Variant specification with `m.` location.
    MtVariant {
        /// Accession of the reference sequence.
        accession: Accession,
        /// Gene symbol, if given.
        gene_symbol: Option<GeneSymbol>,
        /// Mitochondrial location and edit.
        loc_edit: MtLocEdit,
    },
    /// Variant specification with `n.` location.
    TxVariant {
        /// Accession of the reference sequence.
        accession: Accession,
        /// Gene symbol, if given.
        gene_symbol: Option<GeneSymbol>,
        /// Transcript location and edit.
        loc_edit: TxLocEdit,
    },
    /// Variant specification with `p.` location.
    ProtVariant {
        /// Accession of the reference sequence.
        accession: Accession,
        /// Gene symbol, if given.
        gene_symbol: Option<GeneSymbol>,
        /// Protein location and edit.
        loc_edit: ProtLocEdit,
    },
    /// Variant specification with `r.` location.
    RnaVariant {
        /// Accession of the reference sequence.
        accession: Accession,
        /// Gene symbol, if given.
        gene_symbol: Option<GeneSymbol>,
        /// RNA location and edit.
        loc_edit: RnaLocEdit,
    },
}

/// Location on the coding sequence together with the edit applied there.
#[derive(Clone, Debug, PartialEq)]
pub struct CdsLocEdit {
    /// CDS interval, possibly marked as uncertain.
    pub loc: Mu<CdsInterval>,
    /// Nucleic acid edit, possibly marked as uncertain.
    pub edit: Mu<NaEdit>,
}

/// Interval between two CDS positions.
#[derive(Clone, Debug, PartialEq)]
pub struct CdsInterval {
    /// Position where the interval begins.
    pub begin: CdsPos,
    /// Position where the interval ends.
    pub end: CdsPos,
}

/// Specifies whether the CDS position is relative to the CDS start or
/// CDS end.
///
/// `Eq` and `Hash` are derived so that values can be used as keys in
/// hash-based collections.
#[derive(Clone, Debug, PartialEq, Eq, Hash)]
pub enum CdsFrom {
    /// Position is relative to the CDS start.
    Start,
    /// Position is relative to the CDS end.
    End,
}

/// A single position on the coding sequence.
#[derive(Clone, Debug, PartialEq)]
pub struct CdsPos {
    /// Base position.
    pub base: i32,
    /// Offset from `base`, if one is given.
    pub offset: Option<i32>,
    /// Whether the position is counted from the CDS start or the CDS end.
    pub cds_from: CdsFrom,
}

/// Location on the genome sequence together with the edit applied there.
#[derive(Clone, Debug, PartialEq)]
pub struct GenomeLocEdit {
    /// Genome interval, possibly marked as uncertain.
    pub loc: Mu<GenomeInterval>,
    /// Nucleic acid edit, possibly marked as uncertain.
    pub edit: Mu<NaEdit>,
}

/// Genome position interval.
///
/// `Eq` and `Hash` are derived so that intervals can be used as keys in
/// hash-based collections.
#[derive(Clone, Debug, PartialEq, Eq, Hash)]
pub struct GenomeInterval {
    /// Start position, if any.
    pub begin: Option<i32>,
    /// End position, if any.
    pub end: Option<i32>,
}

/// Location on the mitochondrial sequence together with the edit applied there.
#[derive(Clone, Debug, PartialEq)]
pub struct MtLocEdit {
    /// Mitochondrial interval, possibly marked as uncertain.
    pub loc: Mu<MtInterval>,
    /// Nucleic acid edit, possibly marked as uncertain.
    pub edit: Mu<NaEdit>,
}

/// Mitochondrial position interval.
///
/// `Eq` and `Hash` are derived so that intervals can be used as keys in
/// hash-based collections.
#[derive(Clone, Debug, PartialEq, Eq, Hash)]
pub struct MtInterval {
    /// Start position, if any.
    pub begin: Option<i32>,
    /// End position, if any.
    pub end: Option<i32>,
}

/// Location on a transcript sequence together with the edit applied there.
#[derive(Clone, Debug, PartialEq)]
pub struct TxLocEdit {
    /// Transcript interval, possibly marked as uncertain.
    pub loc: Mu<TxInterval>,
    /// Nucleic acid edit, possibly marked as uncertain.
    pub edit: Mu<NaEdit>,
}

/// Interval between two transcript positions.
#[derive(Clone, Debug, PartialEq)]
pub struct TxInterval {
    /// Position where the interval begins.
    pub begin: TxPos,
    /// Position where the interval ends.
    pub end: TxPos,
}

/// Transcript position.
///
/// `Eq` and `Hash` are derived so that positions can be used as keys in
/// hash-based collections.
#[derive(Clone, Debug, PartialEq, Eq, Hash)]
pub struct TxPos {
    /// Base position.
    pub base: i32,
    /// Optional offset.
    pub offset: Option<i32>,
}

/// Location on an RNA sequence together with the edit applied there.
#[derive(Clone, Debug, PartialEq)]
pub struct RnaLocEdit {
    /// RNA interval, possibly marked as uncertain.
    pub loc: Mu<RnaInterval>,
    /// Nucleic acid edit, possibly marked as uncertain.
    pub edit: Mu<NaEdit>,
}

/// Interval between two RNA positions.
#[derive(Clone, Debug, PartialEq)]
pub struct RnaInterval {
    /// Position where the interval begins.
    pub begin: RnaPos,
    /// Position where the interval ends.
    pub end: RnaPos,
}

/// RNA position.
///
/// `Eq` and `Hash` are derived so that positions can be used as keys in
/// hash-based collections.
#[derive(Clone, Debug, PartialEq, Eq, Hash)]
pub struct RnaPos {
    /// Base position.
    pub base: i32,
    /// Optional offset.
    pub offset: Option<i32>,
}

/// Protein-level change: either a location with an edit, or one of the
/// special forms that carry no location.
#[derive(Clone, Debug, PartialEq)]
pub enum ProtLocEdit {
    /// Regular case with a location and an edit.
    Ordinary {
        /// Protein interval, possibly marked as uncertain.
        loc: Mu<ProtInterval>,
        /// Protein edit, possibly marked as uncertain.
        edit: Mu<ProteinEdit>,
    },
    /// `=`
    NoChange,
    /// `(=)`
    NoChangeUncertain,
    /// `0`
    NoProtein,
    /// `0?`
    NoProteinUncertain,
}

/// Interval between two protein positions.
#[derive(Clone, Debug, PartialEq)]
pub struct ProtInterval {
    /// Position where the interval begins.
    pub begin: ProtPos,
    /// Position where the interval ends.
    pub end: ProtPos,
}

/// Protein position.
///
/// `Eq` and `Hash` are derived so that positions can be used as keys in
/// hash-based collections.
#[derive(Clone, Debug, PartialEq, Eq, Hash)]
pub struct ProtPos {
    /// Amino acid value.
    pub aa: String,
    /// Number belonging to `aa`.
    // NOTE(review): presumably the position of the amino acid in the protein sequence; confirm.
    pub number: i32,
}
Loading

0 comments on commit dbcfd05

Please sign in to comment.