Skip to content

Commit

Permalink
🔨 Scrape Number and Specify Description (#34)
Browse files Browse the repository at this point in the history
* 🔨 Added import of custom desc from config
🔨 Add number field scrape

* 🧹 minor cleanup

* 📝 add rc version

* 🧹 catch impossible

🧹 cleanup naming

📝 add semicolon

* 📝 restored original default str to output for desc

* 📝 added PR requested updates
🚧 test in progress

* ✅ added test for proposed feature

* 💍 feature proposal to add cmd line

* 🧹 cleanup epaths print
📝 default Number to 1 when the INFO Number is A, R, or G
🧪 updated tests

* ⏪ revert desc prepend tag

---------

Co-authored-by: Miller <millerd15@email.chop.edu>
  • Loading branch information
migbro and dmiller15 authored Jan 31, 2024
1 parent 3adb13f commit d4ab3a8
Show file tree
Hide file tree
Showing 9 changed files with 94 additions and 10 deletions.
1 change: 1 addition & 0 deletions src/commands/annotate_cmd.rs
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@ pub fn annotate_main(
e
})
.collect();
EchtVars::add_cmd_header(&mut header, &ipath, &opath, &include_expr, epaths);

let parser = fasteval::Parser::new();
let mut slab = fasteval::Slab::new();
Expand Down
15 changes: 13 additions & 2 deletions src/commands/encoder_cmd.rs
Original file line number Diff line number Diff line change
Expand Up @@ -187,7 +187,7 @@ pub fn encoder_main(vpaths: Vec<&str>, opath: &str, jpath: &str) {
let mut lookups = HashMap::new();

for f in fields.iter_mut() {
let (tt, _tl) = if f.field == "FILTER" {
let (tt, tl) = if f.field == "FILTER" {
(TagType::String, TagLength::Variable)
} else {
header
Expand Down Expand Up @@ -216,7 +216,18 @@ pub fn encoder_main(vpaths: Vec<&str>, opath: &str, jpath: &str) {
"[echtvar] unsupported field type: {:?} for field {}",
tt, f.field
),
}
};
match tl {
TagLength::Fixed(value) => f.number = value.to_string(),
TagLength::AltAlleles => f.number = "A".to_string(),
TagLength::Alleles => f.number = "R".to_string(),
TagLength::Genotypes => f.number = "G".to_string(),
TagLength::Variable => f.number = ".".to_string(),
_ => panic!(
"[echtvar] unsupported field length: {:?} for field {}",
tl, f.field
),
};
}

let zfile = std::fs::File::create(&zpath).unwrap();
Expand Down
24 changes: 22 additions & 2 deletions src/lib/echtvar.rs
Original file line number Diff line number Diff line change
Expand Up @@ -177,21 +177,41 @@ impl EchtVars {
for e in &self.fields {
header.push_record(
format!(
"##INFO=<ID={},Number=1,Type={},Description=\"{}\">",
"##INFO=<ID={},Number={},Type={},Description=\"{}\">",
e.alias,
if vec!["A", "R", "G"].iter().any(|n| n == &e.number) {
"1"
} else {
&e.number
},
if e.ftype == fields::FieldType::Integer {
"Integer"
} else if e.ftype == fields::FieldType::Categorical {
"String"
} else {
"Float"
},
format!("added by echtvar from {}", path)
if &e.description.to_string() == "added by echtvar" {
format!("added by echtvar from {}", path)
} else {
format!("added by echtvar {}", e.description.to_string())
}
)
.as_bytes(),
);
}
}
/// Record the `anno` invocation in the output header.
///
/// Pushes one `##echtvar_annoCommand=anno ...` record onto `header`, built from
/// the arguments in this order: the optional include expression (`-i`), the
/// input path `vpath`, the output path `opath`, then one `-e <path>` per entry
/// of `epaths`.
///
/// * `include_expr` - when `None`, no `-i` flag is written; when `Some`, the
///   expression is written double-quoted (it may contain spaces).
/// * `epaths` - each path gets its own `-e` flag so the recorded command mirrors
///   how multiple archives are actually passed on the command line.
pub fn add_cmd_header(header: &mut bcf::header::Header, vpath: &str, opath: &str, include_expr: &Option<&str>, epaths: Vec<&str>) {
    let mut record = String::from("##echtvar_annoCommand=anno");

    // Formatting the `Option` itself with `{:?}` would leak `Some(..)` / `None`
    // into the header, so unwrap it and only quote the expression text.
    if let Some(expr) = include_expr {
        record.push_str(&format!(" -i {:?}", expr));
    }

    record.push_str(&format!(" {} {}", vpath, opath));

    // Emit the paths individually instead of Debug-quoting the joined string,
    // which produced a single argument such as `-e "a -e b"`.
    for epath in &epaths {
        record.push_str(&format!(" -e {}", epath));
    }

    header.push_record(record.as_bytes());
}

#[inline(always)]
pub fn set_position(
Expand Down
12 changes: 12 additions & 0 deletions src/lib/fields.rs
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,8 @@ pub struct Field {
pub missing_value: i32,
#[serde(default = "default_missing_string")]
pub missing_string: std::string::String,
#[serde(default = "default_description_string")]
pub description: std::string::String,

#[serde(default)]
pub zigzag: bool,
Expand All @@ -23,6 +25,11 @@ pub struct Field {
pub multiplier: u32,
#[serde(default)]
pub ftype: FieldType,
#[serde(default)]
pub number: std::string::String,



#[serde(default = "default_values_i", skip_serializing)]
pub values_i: usize,
}
Expand All @@ -33,6 +40,9 @@ fn default_missing_value() -> i32 {
/// Serde default for `Field::missing_string`: the placeholder text `"MISSING"`.
fn default_missing_string() -> std::string::String {
    String::from("MISSING")
}
/// Serde default for `Field::description`, used when the config gives no
/// `description` for a field.
fn default_description_string() -> std::string::String {
    String::from("added by echtvar")
}
/// Default multiplier of 1.
// NOTE(review): presumably wired up as the serde default for `Field::multiplier` — confirm.
fn default_multiplier() -> u32 {
    1_u32
}
Expand All @@ -47,9 +57,11 @@ impl Default for Field {
alias: "name".to_string(),
missing_value: -1,
missing_string: "MISSING".to_string(),
description: "added by echtvar".to_string(),
zigzag: false,
multiplier: 1,
ftype: FieldType::Integer,
number: ".".to_string(),
values_i: usize::MAX,
}
}
Expand Down
8 changes: 8 additions & 0 deletions tests/big.sh
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,10 @@ for mod in 2 3 4 5; do

# check that some variants remain unannotated
python3 check.py anno.vcf.gz $mod
# check custom INFO Description used from config
python3 check-string-for-issue33.py anno.vcf.gz aval 1 "added by echtvar TEST description field"
# check A/R/G converted to 1 for number
python3 check-string-for-issue33.py anno.vcf.gz external_AC 1 "added by echtvar Theoretical AC from another source"


if [[ "mod" -ne "1" ]]; then
Expand All @@ -24,6 +28,10 @@ for mod in 2 3 4 5; do
$echtvar encode test.echtvar1 test1.hjson generated-subset1.vcf
$echtvar anno generated-all.vcf -e test.echtvar0 -e test.echtvar1 anno.vcf.gz
python3 check.py anno.vcf.gz 1
# check default Description used
python3 check-string-for-issue33.py anno.vcf.gz aval1 1 "added by echtvar from test.echtvar1"
# check value . is left alone
python3 check-string-for-issue33.py anno.vcf.gz external_str . "added by echtvar from test.echtvar1"
fi


Expand Down
29 changes: 29 additions & 0 deletions tests/check-string-for-issue33.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
"""
Small test to ensure expected INFO fields generated
"""
import gzip
import sys

# Usage: check-string-for-issue33.py <vcf.gz> <INFO ID> <expected Number> <expected Description>


def check_info_header(lines, field, num, desc):
    """Check the ##INFO header line for `field` against the expected values.

    Scans `lines` (an iterable of header text lines) until the INFO line whose
    ID is `field` is found or the `#CHROM` line is reached.

    Returns True when the INFO line is found and both its Number and its
    Description match, False when no INFO line for `field` appears before the
    `#CHROM` line. Raises AssertionError when the line is found but malformed
    or when Number/Description differ from the expected values.
    """
    prefix = "##INFO=<ID={},".format(field)
    for line in lines:
        if line.startswith(prefix):
            # Split on the first three commas only, so a Description that itself
            # contains commas is kept in one piece.
            toks = line.rstrip("\n").split(",", 3)
            if len(toks) < 4:
                raise AssertionError(
                    f"Malformed INFO header for {field}: {line.rstrip()}"
                )
            # Exact comparison: a prefix match would let Number=1 accept Number=10.
            num_to_check = "Number={}".format(num)
            if toks[1] != num_to_check:
                raise AssertionError(
                    f"Expected {num_to_check} for {field}, got {toks[1]}"
                )
            # Prefix match because the closing '>' follows the quoted description.
            desc_to_check = "Description=\"{}\"".format(desc)
            if not toks[3].startswith(desc_to_check):
                raise AssertionError(
                    f"Expected {desc_to_check} for {field}, got {toks[3]}"
                )
            return True
        # only want to look through header
        if line.startswith("#CHROM"):
            break
    return False


def main(argv):
    """Run the check from command-line arguments; return the process exit code."""
    fpath, field, num, desc = argv[1:5]
    # Context manager so the gzip handle is closed even when a check fails.
    with gzip.open(fpath, "rt") as fh:
        ok = check_info_header(fh, field, num, desc)
    if not ok:
        print(f"Expected field {field} not found in INFO", file=sys.stderr)
        return 1
    return 0


if __name__ == "__main__":
    sys.exit(main(sys.argv))
9 changes: 5 additions & 4 deletions tests/make-vcf.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@
print(header % 1, file=subset1_fh)

nvar = 0

str_vals = ["YES", "NO", "MAYBE"]
for switch in [1, 2, 3, 4, 5, 1132, 1133, 1134]:
switch = switch<<20

Expand All @@ -38,13 +38,14 @@
for alen in range(0, 5):
for balt in itertools.permutations("ACGT", alen):
val = random.randint(0, 10000000)
ac = random.randint(1, 3)
alt = ref[0] + "".join(balt)
print(f"chr1\t{i}\t.\t{ref}\t{alt}\t1\tPASS\tval={val};nvar={nvar}", file=all_fh)
print(f"chr1\t{i}\t.\t{ref}\t{alt}\t1\tPASS\tval={val};nvar={nvar};AC={ac};str={str_vals[(ac-1)]}", file=all_fh)

if nvar % mod == 0:
print(f"chr1\t{i}\t.\t{ref}\t{alt}\t1\tPASS\tval0={val};nvar={nvar}", file=subset0_fh)
print(f"chr1\t{i}\t.\t{ref}\t{alt}\t1\tPASS\tval0={val};nvar={nvar};AC={ac};str={str_vals[(ac-1)]}", file=subset0_fh)
else:
print(f"chr1\t{i}\t.\t{ref}\t{alt}\t1\tPASS\tval1={val};nvar={nvar}", file=subset1_fh)
print(f"chr1\t{i}\t.\t{ref}\t{alt}\t1\tPASS\tval1={val};nvar={nvar};AC={ac};str={str_vals[(ac-1)]}", file=subset1_fh)
nvar += 1

for ref in ["ACCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC",
Expand Down
3 changes: 2 additions & 1 deletion tests/test0.hjson
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
[
{"field": "val0", "alias": "aval"}
{"field": "val0", "alias": "aval", "description": "TEST description field"},
{"field": "AC", "alias": "external_AC", "description": "Theoretical AC from another source"}
]
3 changes: 2 additions & 1 deletion tests/test1.hjson
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
[
{"field": "val1", "alias": "aval1"}
{"field": "val1", "alias": "aval1"},
{"field": "str", "alias": "external_str"}
]

0 comments on commit d4ab3a8

Please sign in to comment.