Skip to content

Commit

Permalink
feat: adding support for clinvar-sv data (#227) (#290)
Browse files Browse the repository at this point in the history
ClinVar SV data can now be imported into RocksDB from JSONL data.
  • Loading branch information
holtgrewe committed Nov 16, 2023
1 parent bac9645 commit 1837899
Show file tree
Hide file tree
Showing 51 changed files with 1,673 additions and 225 deletions.
409 changes: 404 additions & 5 deletions Cargo.lock

Large diffs are not rendered by default.

4 changes: 3 additions & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,8 @@ path = "src/main.rs"
actix-web = "4.4.0"
anyhow = "1.0"
bgzip = "0.3"
bio = "1.4.0"
biocommons-bioutils = "0.1.0"
boolvec = "0.2"
byteorder = "1.4"
chrono = { version = "0.4", features = ["serde"] }
Expand Down Expand Up @@ -50,7 +52,7 @@ strum = { version = "0.25", features = ["strum_macros", "derive"] }
thiserror = "1.0"
tracing = "0.1"
tracing-subscriber = "0.3"
biocommons-bioutils = "0.1.0"
rustc-hash = "1.1.0"

[build-dependencies]
prost-build = "0.12"
Expand Down
2 changes: 2 additions & 0 deletions build.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
fn main() {
println!("cargo:rerun-if-changed=src/proto/annonars/clinvar/v1/minimal.proto");
println!("cargo:rerun-if-changed=src/proto/annonars/clinvar/v1/per_gene.proto");
println!("cargo:rerun-if-changed=src/proto/annonars/clinvar/v1/sv.proto");
println!("cargo:rerun-if-changed=src/proto/annonars/cons/v1/base.proto");
println!("cargo:rerun-if-changed=src/proto/annonars/dbsnp/v1/base.proto");
println!("cargo:rerun-if-changed=src/proto/annonars/gene/v1/base.proto");
Expand Down Expand Up @@ -38,6 +39,7 @@ fn main() {
&[
"annonars/clinvar/v1/minimal.proto",
"annonars/clinvar/v1/per_gene.proto",
"annonars/clinvar/v1/sv.proto",
"annonars/cons/v1/base.proto",
"annonars/dbsnp/v1/base.proto",
"annonars/gene/v1/base.proto",
Expand Down
56 changes: 29 additions & 27 deletions src/clinvar_genes/cli/import.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,16 +5,14 @@ use std::{collections::HashSet, io::BufRead, sync::Arc};
use clap::Parser;
use prost::Message;

use crate::{
clinvar_genes::{
self,
pbs::{
ClinicalSignificance, GeneFreqRecordCounts, GeneImpactRecordCounts,
GeneVariantsForRelease, ReferenceAssertion, ReviewStatus, SequenceVariant,
},
},
clinvar_minimal, common,
use crate::pbs::annonars::clinvar::v1::minimal::{
ClinicalSignificance, Record, ReferenceAssertion, ReviewStatus,
};
use crate::pbs::annonars::clinvar::v1::per_gene::{
ClinvarPerGeneRecord, CoarseClinicalSignificance, GeneFreqRecordCounts, GeneImpactRecordCounts,
GeneVariantsForRelease, Impact,
};
use crate::{clinvar_genes, clinvar_minimal, common};

/// Command line arguments for `clinvar-gene import` sub command.
#[derive(Parser, Debug, Clone)]
Expand Down Expand Up @@ -64,7 +62,7 @@ fn load_per_impact_jsonl(

let mut count_out = Vec::new();
for (impact, counts) in record.counts {
let impact: crate::clinvar_genes::pbs::Impact = impact.into();
let impact: Impact = impact.into();
count_out.push(GeneImpactRecordCounts {
impact: impact as i32,
counts,
Expand Down Expand Up @@ -99,8 +97,7 @@ fn load_per_frequency_jsonl(

let mut count_out = Vec::new();
for (clinsig, counts) in record.counts {
let coarse_clinsig: crate::clinvar_genes::pbs::CoarseClinicalSignificance =
clinsig.into();
let coarse_clinsig: CoarseClinicalSignificance = clinsig.into();
count_out.push(GeneFreqRecordCounts {
coarse_clinsig: coarse_clinsig as i32,
counts,
Expand All @@ -112,7 +109,7 @@ fn load_per_frequency_jsonl(
Ok(result)
}

type PerVcv = indexmap::IndexMap<String, SequenceVariant>;
type PerVcv = indexmap::IndexMap<String, Record>;
type PerAssembly = indexmap::IndexMap<String, PerVcv>;
type PerGene = indexmap::IndexMap<String, PerAssembly>;

Expand Down Expand Up @@ -150,33 +147,38 @@ fn load_variants_jsonl(
clinical_significance,
review_status,
sequence_location,
..
} = input_record;
let clinvar_minimal::cli::reading::SequenceLocation {
assembly,
chr,
start,
stop,
reference_allele_vcf,
alternate_allele_vcf,
..
} = sequence_location;

if let (Some(reference_allele_vcf), Some(alternate_allele_vcf)) =
(reference_allele_vcf, alternate_allele_vcf)
if let (
Some(start),
Some(stop),
Some(reference_allele_vcf),
Some(alternate_allele_vcf),
) = (start, stop, reference_allele_vcf, alternate_allele_vcf)
{
for hgnc_id in hgnc_ids {
let per_release = per_gene.entry(hgnc_id).or_default();
let per_vcv = per_release.entry(assembly.clone()).or_default();
let seqvar =
per_vcv
.entry(vcv.clone())
.or_insert_with(|| SequenceVariant {
chrom: chr.clone(),
pos: start,
reference: reference_allele_vcf.clone(),
alternative: alternate_allele_vcf.clone(),
vcv: vcv.clone(),
reference_assertions: vec![],
});
let seqvar = per_vcv.entry(vcv.clone()).or_insert_with(|| Record {
release: assembly.clone(),
start,
stop,
reference: reference_allele_vcf.clone(),
alternative: alternate_allele_vcf.clone(),
vcv: vcv.clone(),
reference_assertions: vec![],
chromosome: chr.clone(),
});
seqvar.reference_assertions.push(ReferenceAssertion {
rcv: rcv.clone(),
title: title.clone(),
Expand Down Expand Up @@ -254,7 +256,7 @@ fn jsonl_import(

// Read through all records and insert each into the database.
for hgnc_id in hgnc_ids.iter() {
let record = clinvar_genes::pbs::ClinvarPerGeneRecord {
let record = ClinvarPerGeneRecord {
per_impact_counts: counts_per_impact.get(hgnc_id).cloned().unwrap_or_default(),
per_freq_counts: counts_per_freq.get(hgnc_id).cloned().unwrap_or_default(),
variants: vars_per_gene.get(hgnc_id).cloned().unwrap_or_default(),
Expand Down
23 changes: 11 additions & 12 deletions src/clinvar_genes/cli/query.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,11 +4,11 @@ use std::sync::Arc;

use prost::Message;

use crate::{clinvar_genes, common};
use crate::{common, pbs::annonars::clinvar::v1::per_gene::ClinvarPerGeneRecord};

/// Command line arguments for `clinvar-gene query` sub command.
#[derive(clap::Parser, Debug, Clone)]
#[command(about = "query gene information data from RocksDB", long_about = None)]
#[command(about = "query clinvar per-gene data from RocksDB", long_about = None)]
pub struct Args {
/// Path to RocksDB directory with data.
#[arg(long)]
Expand Down Expand Up @@ -63,7 +63,7 @@ pub fn open_rocksdb_from_args(
fn print_record(
out_writer: &mut Box<dyn std::io::Write>,
output_format: common::cli::OutputFormat,
value: &clinvar_genes::pbs::ClinvarPerGeneRecord,
value: &ClinvarPerGeneRecord,
) -> Result<(), anyhow::Error> {
match output_format {
common::cli::OutputFormat::Jsonl => {
Expand All @@ -79,20 +79,19 @@ pub fn query_for_gene(
hgnc_id: &str,
db: &rocksdb::DBWithThreadMode<rocksdb::MultiThreaded>,
cf_data: &Arc<rocksdb::BoundColumnFamily>,
) -> Result<Option<clinvar_genes::pbs::ClinvarPerGeneRecord>, anyhow::Error> {
) -> Result<Option<ClinvarPerGeneRecord>, anyhow::Error> {
let raw_value = db
.get_cf(cf_data, hgnc_id.as_bytes())
.map_err(|e| anyhow::anyhow!("error while querying for HGNC ID {}: {}", hgnc_id, e))?;
raw_value
.map(|raw_value| {
clinvar_genes::pbs::ClinvarPerGeneRecord::decode(&mut std::io::Cursor::new(&raw_value))
.map_err(|e| {
anyhow::anyhow!(
"error while decoding clinvar per gene record for HGNC ID {}: {}",
hgnc_id,
e
)
})
ClinvarPerGeneRecord::decode(&mut std::io::Cursor::new(&raw_value)).map_err(|e| {
anyhow::anyhow!(
"error while decoding clinvar per gene record for HGNC ID {}: {}",
hgnc_id,
e
)
})
})
.transpose()
}
Expand Down
58 changes: 37 additions & 21 deletions src/clinvar_genes/cli/reading.rs
Original file line number Diff line number Diff line change
Expand Up @@ -55,38 +55,52 @@ pub mod gene_impact {
UpstreamGeneVariant,
}

impl From<Impact> for crate::clinvar_genes::pbs::Impact {
impl From<Impact> for crate::pbs::annonars::clinvar::v1::per_gene::Impact {
fn from(val: Impact) -> Self {
match val {
Impact::ThreePrimeUtrVariant => {
crate::clinvar_genes::pbs::Impact::ThreePrimeUtrVariant
crate::pbs::annonars::clinvar::v1::per_gene::Impact::ThreePrimeUtrVariant
}
Impact::FivePrimeUtrVariant => {
crate::clinvar_genes::pbs::Impact::FivePrimeUtrVariant
crate::pbs::annonars::clinvar::v1::per_gene::Impact::FivePrimeUtrVariant
}
Impact::DownstreamGeneVariant => {
crate::clinvar_genes::pbs::Impact::DownstreamTranscriptVariant
crate::pbs::annonars::clinvar::v1::per_gene::Impact::DownstreamTranscriptVariant
}
Impact::FrameshiftVariant => {
crate::pbs::annonars::clinvar::v1::per_gene::Impact::FrameshiftVariant
}
Impact::InframeIndel => {
crate::pbs::annonars::clinvar::v1::per_gene::Impact::InframeIndel
}
Impact::StartLost => crate::pbs::annonars::clinvar::v1::per_gene::Impact::StartLost,
Impact::IntronVariant => {
crate::pbs::annonars::clinvar::v1::per_gene::Impact::IntronVariant
}
Impact::MissenseVariant => {
crate::pbs::annonars::clinvar::v1::per_gene::Impact::MissenseVariant
}
Impact::FrameshiftVariant => crate::clinvar_genes::pbs::Impact::FrameshiftVariant,
Impact::InframeIndel => crate::clinvar_genes::pbs::Impact::InframeIndel,
Impact::StartLost => crate::clinvar_genes::pbs::Impact::StartLost,
Impact::IntronVariant => crate::clinvar_genes::pbs::Impact::IntronVariant,
Impact::MissenseVariant => crate::clinvar_genes::pbs::Impact::MissenseVariant,
Impact::NonCodingTranscriptVariant => {
crate::clinvar_genes::pbs::Impact::NonCodingTranscriptVariant
crate::pbs::annonars::clinvar::v1::per_gene::Impact::NonCodingTranscriptVariant
}
Impact::StopGained => {
crate::pbs::annonars::clinvar::v1::per_gene::Impact::StopGained
}
Impact::StopGained => crate::clinvar_genes::pbs::Impact::StopGained,
Impact::NoSequenceAlteration => {
crate::clinvar_genes::pbs::Impact::NoSequenceAlteration
crate::pbs::annonars::clinvar::v1::per_gene::Impact::NoSequenceAlteration
}
Impact::SpliceAcceptorVariant => {
crate::clinvar_genes::pbs::Impact::SpliceAcceptorVariant
crate::pbs::annonars::clinvar::v1::per_gene::Impact::SpliceAcceptorVariant
}
Impact::SpliceDonorVariant => {
crate::pbs::annonars::clinvar::v1::per_gene::Impact::SpliceDonorVariant
}
Impact::StopLost => crate::pbs::annonars::clinvar::v1::per_gene::Impact::StopLost,
Impact::SyonymousVariant => {
crate::pbs::annonars::clinvar::v1::per_gene::Impact::SynonymousVariant
}
Impact::SpliceDonorVariant => crate::clinvar_genes::pbs::Impact::SpliceDonorVariant,
Impact::StopLost => crate::clinvar_genes::pbs::Impact::StopLost,
Impact::SyonymousVariant => crate::clinvar_genes::pbs::Impact::SynonymousVariant,
Impact::UpstreamGeneVariant => {
crate::clinvar_genes::pbs::Impact::UpstreamTranscriptVariant
crate::pbs::annonars::clinvar::v1::per_gene::Impact::UpstreamTranscriptVariant
}
}
}
Expand Down Expand Up @@ -138,17 +152,19 @@ pub mod counts_by_freq {
Pathogenic,
}

impl From<CoarseClinicalSignificance> for crate::clinvar_genes::pbs::CoarseClinicalSignificance {
impl From<CoarseClinicalSignificance>
for crate::pbs::annonars::clinvar::v1::per_gene::CoarseClinicalSignificance
{
fn from(val: CoarseClinicalSignificance) -> Self {
match val {
CoarseClinicalSignificance::Benign => {
crate::clinvar_genes::pbs::CoarseClinicalSignificance::CoarseBenign
crate::pbs::annonars::clinvar::v1::per_gene::CoarseClinicalSignificance::CoarseBenign
}
CoarseClinicalSignificance::Uncertain => {
crate::clinvar_genes::pbs::CoarseClinicalSignificance::CoarseUncertain
crate::pbs::annonars::clinvar::v1::per_gene::CoarseClinicalSignificance::CoarseUncertain
}
CoarseClinicalSignificance::Pathogenic => {
crate::clinvar_genes::pbs::CoarseClinicalSignificance::CoarsePathogenic
crate::pbs::annonars::clinvar::v1::per_gene::CoarseClinicalSignificance::CoarsePathogenic
}
}
}
Expand Down
1 change: 0 additions & 1 deletion src/clinvar_genes/mod.rs
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
//! Command line interface for detailed ClinVar per-gene data.

pub mod cli;
pub mod pbs;
3 changes: 0 additions & 3 deletions src/clinvar_genes/pbs.rs

This file was deleted.

20 changes: 12 additions & 8 deletions src/clinvar_minimal/cli/import.rs
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ use prost::Message;
use crate::{
clinvar_minimal,
common::{self, keys},
pbs::annonars::clinvar::v1::minimal::ReferenceAssertion,
};

/// Command line arguments for `clinvar-minimal import` sub command.
Expand Down Expand Up @@ -69,19 +70,21 @@ fn jsonl_import(
sequence_location,
..
} = record;
let clinical_significance: clinvar_minimal::pbs::ClinicalSignificance =
let clinical_significance: crate::pbs::annonars::clinvar::v1::minimal::ClinicalSignificance =
clinical_significance.into();
let review_status: clinvar_minimal::pbs::ReviewStatus = review_status.into();
let review_status: crate::pbs::annonars::clinvar::v1::minimal::ReviewStatus =
review_status.into();
let clinvar_minimal::cli::reading::SequenceLocation {
assembly,
chr,
start,
stop,
reference_allele_vcf,
alternate_allele_vcf,
..
} = sequence_location;
if let (Some(reference_allele_vcf), Some(alternate_allele_vcf)) =
(reference_allele_vcf, alternate_allele_vcf)
if let (Some(start), Some(stop), Some(reference_allele_vcf), Some(alternate_allele_vcf)) =
(start, stop, reference_allele_vcf, alternate_allele_vcf)
{
let var = keys::Var::from(
&chr,
Expand All @@ -101,9 +104,10 @@ fn jsonl_import(
}
Ok(data) => {
let record = if let Some(data) = data {
let mut record = clinvar_minimal::pbs::Record::decode(&data[..])?;
let mut record =
crate::pbs::annonars::clinvar::v1::minimal::Record::decode(&data[..])?;
record.reference_assertions.push(
clinvar_minimal::pbs::ReferenceAssertion {
crate::pbs::annonars::clinvar::v1::minimal::ReferenceAssertion {
rcv,
title,
clinical_significance: clinical_significance.into(),
Expand All @@ -115,15 +119,15 @@ fn jsonl_import(
.sort_by_key(|a| (a.clinical_significance, a.review_status));
record
} else {
clinvar_minimal::pbs::Record {
crate::pbs::annonars::clinvar::v1::minimal::Record {
release: assembly,
chromosome: chr,
start,
stop,
reference: reference_allele_vcf,
alternative: alternate_allele_vcf,
vcv,
reference_assertions: vec![clinvar_minimal::pbs::ReferenceAssertion {
reference_assertions: vec![ReferenceAssertion {
rcv,
title,
clinical_significance: clinical_significance.into(),
Expand Down
Loading

0 comments on commit 1837899

Please sign in to comment.