Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: integrate gnomAD v4 gene constraints (#367) #370

Merged
merged 2 commits into from
Dec 28, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 11 additions & 4 deletions src/genes/cli/data.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1171,7 +1171,7 @@ pub mod gnomad_constraints {
// If parsed as T or None, return that.
MaybeNA::Value(value) => Ok(value),

// Otherwise, if value is string an "n/a", return None (and fail if it is any other
// Otherwise, if the value is the string "NA", return None (and fail if it is any other
// string)
MaybeNA::NAString(string) => {
if string == "NA" {
Expand Down Expand Up @@ -2162,9 +2162,16 @@ mod tests {
Ok(())
}

#[test]
fn deserialize_gnomad_constraints() -> Result<(), anyhow::Error> {
let path_tsv = "tests/genes/gnomad_constraints/gnomad_constraints.tsv";
#[rstest::rstest]
#[case::gnomad_v2("2.1")]
#[case::gnomad_v4("4.0")]
fn deserialize_gnomad_constraints(
#[case] gnomad_constraints_version: &str,
) -> Result<(), anyhow::Error> {
crate::common::set_snapshot_suffix!("{}", &gnomad_constraints_version);
let path_tsv = format!(
"tests/genes/gnomad_constraints/v{gnomad_constraints_version}/gnomad_constraints.tsv",
);
let str_tsv = std::fs::read_to_string(path_tsv)?;
let mut rdr = csv::ReaderBuilder::new()
.delimiter(b'\t')
Expand Down
15 changes: 10 additions & 5 deletions src/genes/cli/import.rs
Original file line number Diff line number Diff line change
Expand Up @@ -138,7 +138,10 @@ fn load_gnomad_constraints(
info!(" loading gnomAD constraints from {}", path);
let mut result = HashMap::new();

let mut reader = csv::ReaderBuilder::new().delimiter(b'\t').from_path(path)?;
let mut reader = csv::ReaderBuilder::new()
.delimiter(b'\t')
.flexible(true)
.from_path(path)?;
for record in reader.deserialize::<gnomad_constraints::Record>() {
let record = record?;
result.insert(record.ensembl_gene_id.clone(), record);
Expand Down Expand Up @@ -1145,8 +1148,10 @@ pub mod test {
use clap_verbosity_flag::Verbosity;
use temp_testdir::TempDir;

#[test]
fn smoke_test() -> Result<(), anyhow::Error> {
#[rstest::rstest]
#[case::gnomad_v2("2.1")]
#[case::gnomad_v4("4.0")]
fn smoke_test(#[case] gnomad_constraints_version: &str) -> Result<(), anyhow::Error> {
let tmp_dir = TempDir::default();
let common_args = common::cli::Args {
verbose: Verbosity::new(1, 0),
Expand All @@ -1159,8 +1164,8 @@ pub mod test {
path_in_clingen_38: String::from(
"tests/genes/clingen/ClinGen_gene_curation_list_GRCh38.tsv",
),
path_in_gnomad_constraints: String::from(
"tests/genes/gnomad_constraints/gnomad_constraints.tsv",
path_in_gnomad_constraints: format!(
"tests/genes/gnomad_constraints/v{gnomad_constraints_version}/gnomad_constraints.tsv",
),
path_in_dbnsfp: String::from("tests/genes/dbnsfp/genes.tsv"),
path_in_hgnc: String::from("tests/genes/hgnc/hgnc_info.jsonl"),
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,194 @@
---
source: src/genes/cli/data.rs
expression: records
---
- ensembl_gene_id: ENSG00000121410
entrez_id: "1"
gene_symbol: A1BG
exp_lof: 43.008
exp_mis: 646.45
exp_syn: 295.68
mis_z: -0.86948
obs_lof: 45
obs_mis: 707
obs_syn: 316
oe_lof: 1.0463
oe_lof_lower: 0.823
oe_lof_upper: 1.342
oe_mis: 1.0937
oe_mis_lower: 1.027
oe_mis_upper: 1.164
oe_syn: 1.0687
oe_syn_lower: 0.974
oe_syn_upper: 1.173
pli: 0.00000000000000017129
syn_z: -0.64437
- ensembl_gene_id: ENSG00000148584
entrez_id: "29974"
gene_symbol: A1CF
exp_lof: 69.896
exp_mis: 744.8
exp_syn: 272.29
mis_z: 1.2281
obs_lof: 45
obs_mis: 653
obs_syn: 272
oe_lof: 0.64381
oe_lof_lower: 0.506
oe_lof_upper: 0.825
oe_mis: 0.87674
oe_mis_lower: 0.821
oe_mis_upper: 0.935
oe_syn: 0.99895
oe_syn_lower: 0.904
oe_syn_upper: 1.104
pli: 0.00000000074328
syn_z: 0.0094264
- ensembl_gene_id: ENSG00000175899
entrez_id: "2"
gene_symbol: A2M
exp_lof: 147.1
exp_mis: 1631.7
exp_syn: 624.65
mis_z: 2.7451
obs_lof: 95
obs_mis: 1328
obs_syn: 541
oe_lof: 0.64584
oe_lof_lower: 0.546
oe_lof_upper: 0.766
oe_mis: 0.81386
oe_mis_lower: 0.777
oe_mis_upper: 0.852
oe_syn: 0.86608
oe_syn_lower: 0.806
oe_syn_upper: 0.93
pli: 0.000000000000000000091331
syn_z: 1.8251
- ensembl_gene_id: ENSG00000166535
entrez_id: "144568"
gene_symbol: A2ML1
exp_lof: 175.89
exp_mis: 1784.1
exp_syn: 680.92
mis_z: 1.6427
obs_lof: 146
obs_mis: 1594
obs_syn: 636
oe_lof: 0.83121
oe_lof_lower: 0.725
oe_lof_upper: 0.954
oe_mis: 0.89347
oe_mis_lower: 0.857
oe_mis_upper: 0.931
oe_syn: 0.93402
oe_syn_lower: 0.874
oe_syn_upper: 0.997
pli: 0.00000000000000000000000000000000000000015734
syn_z: 0.93881
- ensembl_gene_id: ENSG00000184389
entrez_id: "127550"
gene_symbol: A3GALT2
exp_lof: 26.534
exp_mis: 381.4
exp_syn: 160.06
mis_z: -4.0864
obs_lof: 41
obs_mis: 600
obs_syn: 253
oe_lof: 1.0228
oe_lof_lower: 0.699
oe_lof_upper: 1.53
oe_mis: 1.5732
oe_mis_lower: 1.471
oe_mis_upper: 1.683
oe_syn: 1.5807
oe_syn_lower: 1.426
oe_syn_upper: 1.754
pli: 0.0000010075
syn_z: -4.0061
- ensembl_gene_id: ENSG00000128274
entrez_id: "53947"
gene_symbol: A4GALT
exp_lof: 21.691
exp_mis: 494.59
exp_syn: 225.34
mis_z: -0.28586
obs_lof: 22
obs_mis: 512
obs_syn: 257
oe_lof: 0.61691
oe_lof_lower: 0.303
oe_lof_upper: 1.394
oe_mis: 1.0352
oe_mis_lower: 0.962
oe_mis_upper: 1.114
oe_syn: 1.1405
oe_syn_lower: 1.029
oe_syn_upper: 1.265
pli: 0.075456
syn_z: -1.1499
- ensembl_gene_id: ENSG00000118017
entrez_id: "51146"
gene_symbol: A4GNT
exp_lof: 27.359
exp_mis: 436.48
exp_syn: 167.83
mis_z: 1.0569
obs_lof: 14
obs_mis: 376
obs_syn: 156
oe_lof: 0.70139
oe_lof_lower: 0.383
oe_lof_upper: 1.377
oe_mis: 0.86143
oe_mis_lower: 0.791
oe_mis_upper: 0.938
oe_syn: 0.92954
oe_syn_lower: 0.815
oe_syn_upper: 1.062
pli: 0.019099
syn_z: 0.49778
- ensembl_gene_id: ENSG00000094914
entrez_id: "8086"
gene_symbol: AAAS
exp_lof: 81.872
exp_mis: 722.62
exp_syn: 280.11
mis_z: 0.87764
obs_lof: 61
obs_mis: 658
obs_syn: 271
oe_lof: 0.74507
oe_lof_lower: 0.605
oe_lof_upper: 0.922
oe_mis: 0.91057
oe_mis_lower: 0.853
oe_mis_upper: 0.971
oe_syn: 0.96747
oe_syn_lower: 0.875
oe_syn_upper: 1.07
pli: 0.0000000000000008615
syn_z: 0.29692
- ensembl_gene_id: ENSG00000081760
entrez_id: "65985"
gene_symbol: AACS
exp_lof: 82.529
exp_mis: 896.85
exp_syn: 349.3
mis_z: 0.4736
obs_lof: 65
obs_mis: 858
obs_syn: 333
oe_lof: 0.7876
oe_lof_lower: 0.644
oe_lof_upper: 0.968
oe_mis: 0.95668
oe_mis_lower: 0.904
oe_mis_upper: 1.012
oe_syn: 0.95333
oe_syn_lower: 0.871
oe_syn_upper: 1.044
pli: 0.0000000000000000085337
syn_z: 0.4756

10 changes: 10 additions & 0 deletions tests/genes/gnomad_constraints/v4.0/gnomad_constraints.tsv
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
ensembl_gene_id entrez_id gene_symbol exp_lof exp_mis exp_syn mis_z obs_lof obs_mis obs_syn oe_lof oe_lof_lower oe_lof_upper oe_mis oe_mis_lower oe_mis_upper oe_syn oe_syn_lower oe_syn_upper pLI syn_z exac_pLI exac_obs_lof exac_exp_lof exac_oe_lof
ENSG00000121410 1 A1BG 4.3008e+01 6.4645e+02 2.9568e+02 -8.6948e-01 45 707 316 1.0463e+00 8.2300e-01 1.3420e+00 1.0937e+00 1.0270e+00 1.1640e+00 1.0687e+00 9.7400e-01 1.1730e+00 1.7129e-16 -6.4437e-01 NA NA NA NA
ENSG00000148584 29974 A1CF 6.9896e+01 7.4480e+02 2.7229e+02 1.2281e+00 45 653 272 6.4381e-01 5.0600e-01 8.2500e-01 8.7674e-01 8.2100e-01 9.3500e-01 9.9895e-01 9.0400e-01 1.1040e+00 7.4328e-10 9.4264e-03 NA NA NA NA
ENSG00000175899 2 A2M 1.4710e+02 1.6317e+03 6.2465e+02 2.7451e+00 95 1328 541 6.4584e-01 5.4600e-01 7.6600e-01 8.1386e-01 7.7700e-01 8.5200e-01 8.6608e-01 8.0600e-01 9.3000e-01 9.1331e-20 1.8251e+00 NA NA NA NA
ENSG00000166535 144568 A2ML1 1.7589e+02 1.7841e+03 6.8092e+02 1.6427e+00 146 1594 636 8.3121e-01 7.2500e-01 9.5400e-01 8.9347e-01 8.5700e-01 9.3100e-01 9.3402e-01 8.7400e-01 9.9700e-01 1.5734e-40 9.3881e-01 NA NA NA NA
ENSG00000184389 127550 A3GALT2 2.6534e+01 3.8140e+02 1.6006e+02 -4.0864e+00 41 600 253 1.0228e+00 6.9900e-01 1.5300e+00 1.5732e+00 1.4710e+00 1.6830e+00 1.5807e+00 1.4260e+00 1.7540e+00 1.0075e-06 -4.0061e+00 NA NA NA NA
ENSG00000128274 53947 A4GALT 2.1691e+01 4.9459e+02 2.2534e+02 -2.8586e-01 22 512 257 6.1691e-01 3.0300e-01 1.3940e+00 1.0352e+00 9.6200e-01 1.1140e+00 1.1405e+00 1.0290e+00 1.2650e+00 7.5456e-02 -1.1499e+00 NA NA NA NA
ENSG00000118017 51146 A4GNT 2.7359e+01 4.3648e+02 1.6783e+02 1.0569e+00 14 376 156 7.0139e-01 3.8300e-01 1.3770e+00 8.6143e-01 7.9100e-01 9.3800e-01 9.2954e-01 8.1500e-01 1.0620e+00 1.9099e-02 4.9778e-01 NA NA NA NA
ENSG00000094914 8086 AAAS 8.1872e+01 7.2262e+02 2.8011e+02 8.7764e-01 61 658 271 7.4507e-01 6.0500e-01 9.2200e-01 9.1057e-01 8.5300e-01 9.7100e-01 9.6747e-01 8.7500e-01 1.0700e+00 8.6150e-16 2.9692e-01 NA NA NA NA
ENSG00000081760 65985 AACS 8.2529e+01 8.9685e+02 3.4930e+02 4.7360e-01 65 858 333 7.8760e-01 6.4400e-01 9.6800e-01 9.5668e-01 9.0400e-01 1.0120e+00 9.5333e-01 8.7100e-01 1.0440e+00 8.5337e-18 4.7560e-01 NA NA NA NA
Loading