From de1e59a843eeca58efda8289415d5de9c0aa9d0b Mon Sep 17 00:00:00 2001 From: Miguel Brown Date: Fri, 26 Jan 2024 16:40:47 +0000 Subject: [PATCH 01/10] :hammer: Added import of custom desc from config :hammer: Add number field scrape --- src/commands/encoder_cmd.rs | 8 ++++++++ src/lib/echtvar.rs | 5 +++-- src/lib/fields.rs | 12 ++++++++++++ 3 files changed, 23 insertions(+), 2 deletions(-) diff --git a/src/commands/encoder_cmd.rs b/src/commands/encoder_cmd.rs index 7081909..0420fe0 100644 --- a/src/commands/encoder_cmd.rs +++ b/src/commands/encoder_cmd.rs @@ -216,6 +216,14 @@ pub fn encoder_main(vpaths: Vec<&str>, opath: &str, jpath: &str) { "[echtvar] unsupported field type: {:?} for field {}", tt, f.field ), + }; + match _tl { + TagLength::Fixed(value) => f.number = format!("Number={}", value), + TagLength::AltAlleles => f.number = "Number=A".to_string(), + TagLength::Alleles => f.number = "Number=R".to_string(), + TagLength::Genotypes => f.number = "Number=G".to_string(), + TagLength::Variable => f.number = "Number=.".to_string(), + _ => println!("Kill me"), } } diff --git a/src/lib/echtvar.rs b/src/lib/echtvar.rs index cce4d7c..8b1b3b4 100644 --- a/src/lib/echtvar.rs +++ b/src/lib/echtvar.rs @@ -177,8 +177,9 @@ impl EchtVars { for e in &self.fields { header.push_record( format!( - "##INFO=", + "##INFO=", e.alias, + e.number, if e.ftype == fields::FieldType::Integer { "Integer" } else if e.ftype == fields::FieldType::Categorical { @@ -186,7 +187,7 @@ impl EchtVars { } else { "Float" }, - format!("added by echtvar from {}", path) + e.description ) .as_bytes(), ); diff --git a/src/lib/fields.rs b/src/lib/fields.rs index 7755dd9..7023feb 100644 --- a/src/lib/fields.rs +++ b/src/lib/fields.rs @@ -15,6 +15,8 @@ pub struct Field { pub missing_value: i32, #[serde(default = "default_missing_string")] pub missing_string: std::string::String, + #[serde(default = "default_description_string")] + pub description: std::string::String, #[serde(default)] pub zigzag: bool, @@ 
-23,6 +25,11 @@ pub struct Field { pub multiplier: u32, #[serde(default)] pub ftype: FieldType, + #[serde(default)] + pub number: std::string::String, + + + #[serde(default = "default_values_i", skip_serializing)] pub values_i: usize, } @@ -33,6 +40,9 @@ fn default_missing_value() -> i32 { fn default_missing_string() -> std::string::String { "MISSING".to_string() } +fn default_description_string() -> std::string::String { + "added by echtvar".to_string() +} fn default_multiplier() -> u32 { 1 } @@ -47,9 +57,11 @@ impl Default for Field { alias: "name".to_string(), missing_value: -1, missing_string: "MISSING".to_string(), + description: "added by echtvar".to_string(), zigzag: false, multiplier: 1, ftype: FieldType::Integer, + number: "Number=.".to_string(), values_i: usize::MAX, } } From 76e94a289454a624e373d1e16ce91f88fb86ac33 Mon Sep 17 00:00:00 2001 From: Miguel Brown Date: Fri, 26 Jan 2024 18:23:20 +0000 Subject: [PATCH 02/10] :broom: minor cleanup --- src/commands/encoder_cmd.rs | 11 +++++------ src/lib/echtvar.rs | 2 +- src/lib/fields.rs | 2 +- 3 files changed, 7 insertions(+), 8 deletions(-) diff --git a/src/commands/encoder_cmd.rs b/src/commands/encoder_cmd.rs index 0420fe0..f5a9f60 100644 --- a/src/commands/encoder_cmd.rs +++ b/src/commands/encoder_cmd.rs @@ -218,12 +218,11 @@ pub fn encoder_main(vpaths: Vec<&str>, opath: &str, jpath: &str) { ), }; match _tl { - TagLength::Fixed(value) => f.number = format!("Number={}", value), - TagLength::AltAlleles => f.number = "Number=A".to_string(), - TagLength::Alleles => f.number = "Number=R".to_string(), - TagLength::Genotypes => f.number = "Number=G".to_string(), - TagLength::Variable => f.number = "Number=.".to_string(), - _ => println!("Kill me"), + TagLength::Fixed(value) => f.number = value.to_string(), + TagLength::AltAlleles => f.number = "A".to_string(), + TagLength::Alleles => f.number = "R".to_string(), + TagLength::Genotypes => f.number = "G".to_string(), + TagLength::Variable => f.number = 
".".to_string(), } } diff --git a/src/lib/echtvar.rs b/src/lib/echtvar.rs index 8b1b3b4..ebb6b16 100644 --- a/src/lib/echtvar.rs +++ b/src/lib/echtvar.rs @@ -177,7 +177,7 @@ impl EchtVars { for e in &self.fields { header.push_record( format!( - "##INFO=", + "##INFO=", e.alias, e.number, if e.ftype == fields::FieldType::Integer { diff --git a/src/lib/fields.rs b/src/lib/fields.rs index 7023feb..d16b7c3 100644 --- a/src/lib/fields.rs +++ b/src/lib/fields.rs @@ -61,7 +61,7 @@ impl Default for Field { zigzag: false, multiplier: 1, ftype: FieldType::Integer, - number: "Number=.".to_string(), + number: ".".to_string(), values_i: usize::MAX, } } From 33cf624a60a89804bd55889f6d6ae9f300336ee3 Mon Sep 17 00:00:00 2001 From: Miguel Brown Date: Fri, 26 Jan 2024 18:36:48 +0000 Subject: [PATCH 03/10] :pencil: add rc version --- Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Cargo.toml b/Cargo.toml index 6a5445f..c61143d 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "echtvar" -version = "0.1.9" +version = "0.2.0-rc" edition = "2021" # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html From bfef828738555d67ebba6e5efcbffaff58c0e183 Mon Sep 17 00:00:00 2001 From: Miller Date: Fri, 26 Jan 2024 14:20:49 -0500 Subject: [PATCH 04/10] :broom: catch impossible :broom: cleanup naming :pencil: add semicolon --- src/commands/encoder_cmd.rs | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/src/commands/encoder_cmd.rs b/src/commands/encoder_cmd.rs index f5a9f60..ef1f723 100644 --- a/src/commands/encoder_cmd.rs +++ b/src/commands/encoder_cmd.rs @@ -187,7 +187,7 @@ pub fn encoder_main(vpaths: Vec<&str>, opath: &str, jpath: &str) { let mut lookups = HashMap::new(); for f in fields.iter_mut() { - let (tt, _tl) = if f.field == "FILTER" { + let (tt, tl) = if f.field == "FILTER" { (TagType::String, TagLength::Variable) } else { header @@ -217,13 +217,17 @@ pub fn 
encoder_main(vpaths: Vec<&str>, opath: &str, jpath: &str) { tt, f.field ), }; - match _tl { + match tl { TagLength::Fixed(value) => f.number = value.to_string(), TagLength::AltAlleles => f.number = "A".to_string(), TagLength::Alleles => f.number = "R".to_string(), TagLength::Genotypes => f.number = "G".to_string(), TagLength::Variable => f.number = ".".to_string(), - } + _ => panic!( + "[echtvar] unsupported field length: {:?} for field {}", + tl, f.field + ), + }; } let zfile = std::fs::File::create(&zpath).unwrap(); From 7352b3e5c52a3d32b25f69d359a3b1007d84be38 Mon Sep 17 00:00:00 2001 From: Miguel Brown Date: Fri, 26 Jan 2024 16:06:23 -0500 Subject: [PATCH 05/10] :pencil: restored original default str to output for desc --- src/lib/echtvar.rs | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/lib/echtvar.rs b/src/lib/echtvar.rs index ebb6b16..c75623c 100644 --- a/src/lib/echtvar.rs +++ b/src/lib/echtvar.rs @@ -187,7 +187,11 @@ impl EchtVars { } else { "Float" }, - e.description + if &e.description.to_string() == "added by echtvar"{ + format!("added by echtvar from {}", path) + } else { + e.description.to_string() + } ) .as_bytes(), ); From 580fb12830f55693dc0dd7d2165e00b9cfacaa91 Mon Sep 17 00:00:00 2001 From: Miguel Brown Date: Sat, 27 Jan 2024 10:38:18 -0500 Subject: [PATCH 06/10] :pencil: added PR requested updates :construction: test in progress --- Cargo.toml | 2 +- src/lib/echtvar.rs | 2 +- tests/big.sh | 2 +- tests/test0.hjson | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index c61143d..6a5445f 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "echtvar" -version = "0.2.0-rc" +version = "0.1.9" edition = "2021" # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html diff --git a/src/lib/echtvar.rs b/src/lib/echtvar.rs index c75623c..38b3a6a 100644 --- a/src/lib/echtvar.rs +++ b/src/lib/echtvar.rs @@ -190,7 +190,7 @@ impl 
EchtVars { if &e.description.to_string() == "added by echtvar"{ format!("added by echtvar from {}", path) } else { - e.description.to_string() + format!("added by echtvar {}", e.description.to_string()) } ) .as_bytes(), diff --git a/tests/big.sh b/tests/big.sh index 332dcc9..e6b904e 100644 --- a/tests/big.sh +++ b/tests/big.sh @@ -61,7 +61,7 @@ for mod in 2 3 4 5; do done -rm generated-all.vcf generated-subset0.vcf anno.vcf.gz test.echtvar0 test.echtvar1 +# rm generated-all.vcf generated-subset0.vcf anno.vcf.gz test.echtvar0 test.echtvar1 bash string.sh echo "SUCCESS" diff --git a/tests/test0.hjson b/tests/test0.hjson index a941947..ae9d0b0 100644 --- a/tests/test0.hjson +++ b/tests/test0.hjson @@ -1,3 +1,3 @@ [ - {"field": "val0", "alias": "aval"} + {"field": "val0", "alias": "aval", "description": "TEST description field"} ] From 03efdecbd94af067453e078cddd20d13207180a1 Mon Sep 17 00:00:00 2001 From: Miguel Brown Date: Mon, 29 Jan 2024 17:53:30 +0000 Subject: [PATCH 07/10] :white_check_mark: added test for proposed feature --- tests/big.sh | 6 +++++- tests/check-string-for-issue33.py | 26 ++++++++++++++++++++++++++ 2 files changed, 31 insertions(+), 1 deletion(-) create mode 100644 tests/check-string-for-issue33.py diff --git a/tests/big.sh b/tests/big.sh index e6b904e..549e36d 100644 --- a/tests/big.sh +++ b/tests/big.sh @@ -16,6 +16,8 @@ for mod in 2 3 4 5; do # check that some variants remain unannotated python3 check.py anno.vcf.gz $mod + # check custom INFO Description used from config + python3 check-string-for-issue33.py anno.vcf.gz aval "added by echtvar TEST description field" if [[ "mod" -ne "1" ]]; then @@ -24,6 +26,8 @@ for mod in 2 3 4 5; do $echtvar encode test.echtvar1 test1.hjson generated-subset1.vcf $echtvar anno generated-all.vcf -e test.echtvar0 -e test.echtvar1 anno.vcf.gz python3 check.py anno.vcf.gz 1 + # check default Description used + python3 check-string-for-issue33.py anno.vcf.gz aval1 "added by echtvar from test.echtvar1" fi @@ -61,7 
+65,7 @@ for mod in 2 3 4 5; do done -# rm generated-all.vcf generated-subset0.vcf anno.vcf.gz test.echtvar0 test.echtvar1 +rm generated-all.vcf generated-subset0.vcf anno.vcf.gz test.echtvar0 test.echtvar1 bash string.sh echo "SUCCESS" diff --git a/tests/check-string-for-issue33.py b/tests/check-string-for-issue33.py new file mode 100644 index 0000000..45a31d3 --- /dev/null +++ b/tests/check-string-for-issue33.py @@ -0,0 +1,26 @@ +""" +Small test to ensure expected INFO fields generated +""" +import sys +import gzip + +fpath = sys.argv[1] +field = sys.argv[2] +desc = sys.argv[3] + +ok = False +for line in gzip.open(sys.argv[1], 'rt'): + if line.startswith("##INFO= Date: Mon, 29 Jan 2024 19:19:18 +0000 Subject: [PATCH 08/10] :ring: feature proposal to add cmd line --- src/commands/annotate_cmd.rs | 1 + src/lib/echtvar.rs | 11 +++++++++++ 2 files changed, 12 insertions(+) diff --git a/src/commands/annotate_cmd.rs b/src/commands/annotate_cmd.rs index 7e4e6b1..3534706 100644 --- a/src/commands/annotate_cmd.rs +++ b/src/commands/annotate_cmd.rs @@ -34,6 +34,7 @@ pub fn annotate_main( e }) .collect(); + EchtVars::add_cmd_header(&mut header, &ipath, &opath, &include_expr, epaths); let parser = fasteval::Parser::new(); let mut slab = fasteval::Slab::new(); diff --git a/src/lib/echtvar.rs b/src/lib/echtvar.rs index 38b3a6a..617fb78 100644 --- a/src/lib/echtvar.rs +++ b/src/lib/echtvar.rs @@ -197,6 +197,17 @@ impl EchtVars { ); } } + pub fn add_cmd_header(header: &mut bcf::header::Header, vpath: &str, opath: &str, include_expr: &Option<&str>, epaths: Vec<&str>) { + header.push_record( + format!( + "##echtvar_anno_Command=anno -i {:?} {} {} -e {:?}", + include_expr, + vpath, + opath, + epaths + ).as_bytes(), + ); + } #[inline(always)] pub fn set_position( From 16628cd528a502173de35503b9544b69174ead39 Mon Sep 17 00:00:00 2001 From: Miguel Brown Date: Tue, 30 Jan 2024 21:51:47 +0000 Subject: [PATCH 09/10] :broom: cleanup epaths print :pencil: default to 1 when INFO is A R G 
:test_tube: updated tests --- src/lib/echtvar.rs | 14 +++++++++----- tests/big.sh | 8 ++++++-- tests/check-string-for-issue33.py | 9 ++++++--- tests/make-vcf.py | 9 +++++---- tests/test0.hjson | 3 ++- tests/test1.hjson | 3 ++- 6 files changed, 30 insertions(+), 16 deletions(-) diff --git a/src/lib/echtvar.rs b/src/lib/echtvar.rs index 617fb78..4536a34 100644 --- a/src/lib/echtvar.rs +++ b/src/lib/echtvar.rs @@ -179,7 +179,11 @@ impl EchtVars { format!( "##INFO=", e.alias, - e.number, + if vec!["A", "R", "G"].iter().any(|n| n == &e.number) { + "1" + } else { + &e.number + }, if e.ftype == fields::FieldType::Integer { "Integer" } else if e.ftype == fields::FieldType::Categorical { @@ -187,10 +191,10 @@ impl EchtVars { } else { "Float" }, - if &e.description.to_string() == "added by echtvar"{ + if &e.description.to_string() == "added by echtvar" { format!("added by echtvar from {}", path) } else { - format!("added by echtvar {}", e.description.to_string()) + e.description.to_string() } ) .as_bytes(), @@ -200,11 +204,11 @@ impl EchtVars { pub fn add_cmd_header(header: &mut bcf::header::Header, vpath: &str, opath: &str, include_expr: &Option<&str>, epaths: Vec<&str>) { header.push_record( format!( - "##echtvar_anno_Command=anno -i {:?} {} {} -e {:?}", + "##echtvar_annoCommand=anno -i {:?} {} {} -e {:?}", include_expr, vpath, opath, - epaths + epaths.join(" -e ") ).as_bytes(), ); } diff --git a/tests/big.sh b/tests/big.sh index 549e36d..84186a8 100644 --- a/tests/big.sh +++ b/tests/big.sh @@ -17,7 +17,9 @@ for mod in 2 3 4 5; do # check that some variants remain unannotated python3 check.py anno.vcf.gz $mod # check custom INFO Description used from config - python3 check-string-for-issue33.py anno.vcf.gz aval "added by echtvar TEST description field" + python3 check-string-for-issue33.py anno.vcf.gz aval 1 "TEST description field" + # check A/R/G converted to 1 for number + python3 check-string-for-issue33.py anno.vcf.gz external_AC 1 "Theoretical AC from another source" 
if [[ "mod" -ne "1" ]]; then @@ -27,7 +29,9 @@ for mod in 2 3 4 5; do $echtvar anno generated-all.vcf -e test.echtvar0 -e test.echtvar1 anno.vcf.gz python3 check.py anno.vcf.gz 1 # check default Description used - python3 check-string-for-issue33.py anno.vcf.gz aval1 "added by echtvar from test.echtvar1" + python3 check-string-for-issue33.py anno.vcf.gz aval1 1 "added by echtvar from test.echtvar1" + # check value . is left alone + python3 check-string-for-issue33.py anno.vcf.gz external_str . "added by echtvar from test.echtvar1" fi diff --git a/tests/check-string-for-issue33.py b/tests/check-string-for-issue33.py index 45a31d3..ac732d6 100644 --- a/tests/check-string-for-issue33.py +++ b/tests/check-string-for-issue33.py @@ -6,14 +6,17 @@ fpath = sys.argv[1] field = sys.argv[2] -desc = sys.argv[3] +num = sys.argv[3] +desc = sys.argv[4] ok = False for line in gzip.open(sys.argv[1], 'rt'): if line.startswith("##INFO= Date: Wed, 31 Jan 2024 13:58:35 +0000 Subject: [PATCH 10/10] :rewind: revert desc prepend tag --- src/lib/echtvar.rs | 2 +- tests/big.sh | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/lib/echtvar.rs b/src/lib/echtvar.rs index 4536a34..1e5bd00 100644 --- a/src/lib/echtvar.rs +++ b/src/lib/echtvar.rs @@ -194,7 +194,7 @@ impl EchtVars { if &e.description.to_string() == "added by echtvar" { format!("added by echtvar from {}", path) } else { - e.description.to_string() + format!("added by echtvar {}", e.description.to_string()) } ) .as_bytes(), diff --git a/tests/big.sh b/tests/big.sh index 84186a8..4139bae 100644 --- a/tests/big.sh +++ b/tests/big.sh @@ -17,9 +17,9 @@ for mod in 2 3 4 5; do # check that some variants remain unannotated python3 check.py anno.vcf.gz $mod # check custom INFO Description used from config - python3 check-string-for-issue33.py anno.vcf.gz aval 1 "TEST description field" + python3 check-string-for-issue33.py anno.vcf.gz aval 1 "added by echtvar TEST description field" # check A/R/G converted to 1 for 
number - python3 check-string-for-issue33.py anno.vcf.gz external_AC 1 "Theoretical AC from another source" + python3 check-string-for-issue33.py anno.vcf.gz external_AC 1 "added by echtvar Theoretical AC from another source" if [[ "mod" -ne "1" ]]; then