Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

🔨 Scrape Number and Specify Description #34

Merged
merged 11 commits into from
Jan 31, 2024
1 change: 1 addition & 0 deletions src/commands/annotate_cmd.rs
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@ pub fn annotate_main(
e
})
.collect();
EchtVars::add_cmd_header(&mut header, &ipath, &opath, &include_expr, epaths);

let parser = fasteval::Parser::new();
let mut slab = fasteval::Slab::new();
Expand Down
15 changes: 13 additions & 2 deletions src/commands/encoder_cmd.rs
Original file line number Diff line number Diff line change
Expand Up @@ -187,7 +187,7 @@ pub fn encoder_main(vpaths: Vec<&str>, opath: &str, jpath: &str) {
let mut lookups = HashMap::new();

for f in fields.iter_mut() {
let (tt, _tl) = if f.field == "FILTER" {
let (tt, tl) = if f.field == "FILTER" {
(TagType::String, TagLength::Variable)
} else {
header
Expand Down Expand Up @@ -216,7 +216,18 @@ pub fn encoder_main(vpaths: Vec<&str>, opath: &str, jpath: &str) {
"[echtvar] unsupported field type: {:?} for field {}",
tt, f.field
),
}
};
match tl {
TagLength::Fixed(value) => f.number = value.to_string(),
TagLength::AltAlleles => f.number = "A".to_string(),
TagLength::Alleles => f.number = "R".to_string(),
TagLength::Genotypes => f.number = "G".to_string(),
TagLength::Variable => f.number = ".".to_string(),
_ => panic!(
"[echtvar] unsupported field length: {:?} for field {}",
tl, f.field
),
};
}

let zfile = std::fs::File::create(&zpath).unwrap();
Expand Down
24 changes: 22 additions & 2 deletions src/lib/echtvar.rs
Original file line number Diff line number Diff line change
Expand Up @@ -177,21 +177,41 @@ impl EchtVars {
for e in &self.fields {
header.push_record(
format!(
"##INFO=<ID={},Number=1,Type={},Description=\"{}\">",
"##INFO=<ID={},Number={},Type={},Description=\"{}\">",
brentp marked this conversation as resolved.
Show resolved Hide resolved
e.alias,
if vec!["A", "R", "G"].iter().any(|n| n == &e.number) {
"1"
} else {
&e.number
},
if e.ftype == fields::FieldType::Integer {
"Integer"
} else if e.ftype == fields::FieldType::Categorical {
"String"
} else {
"Float"
},
format!("added by echtvar from {}", path)
if &e.description.to_string() == "added by echtvar" {
format!("added by echtvar from {}", path)
} else {
format!("added by echtvar {}", e.description.to_string())
}
)
.as_bytes(),
);
}
}
/// Record the `anno` invocation in `header` as a `##echtvar_annoCommand=` line.
///
/// * `header` - header that receives the record via `push_record`.
/// * `vpath` - first path written after the optional `-i` argument (the caller
///   visible in this change passes its input path here).
/// * `opath` - second path written (the caller passes its output path here).
/// * `include_expr` - optional include expression; written as `-i "<expr>"`
///   when present and omitted entirely when `None`.
/// * `epaths` - echtvar archive paths; each one is written as its own
///   `-e <path>` argument.
pub fn add_cmd_header(header: &mut bcf::header::Header, vpath: &str, opath: &str, include_expr: &Option<&str>, epaths: Vec<&str>) {
    let mut cmd = String::from("##echtvar_annoCommand=anno");
    if let Some(expr) = include_expr {
        // Debug-format the bare expression (not the Option) so it is quoted as a
        // single argument without a `Some(...)` wrapper leaking into the header.
        cmd.push_str(&format!(" -i {:?}", expr));
    }
    cmd.push_str(&format!(" {} {}", vpath, opath));
    // One `-e` per archive, unquoted, so the recorded command mirrors the real
    // argument list instead of a single quoted blob.
    for epath in &epaths {
        cmd.push_str(&format!(" -e {}", epath));
    }
    header.push_record(cmd.as_bytes());
}

#[inline(always)]
pub fn set_position(
Expand Down
12 changes: 12 additions & 0 deletions src/lib/fields.rs
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,8 @@ pub struct Field {
pub missing_value: i32,
#[serde(default = "default_missing_string")]
pub missing_string: std::string::String,
#[serde(default = "default_description_string")]
pub description: std::string::String,

#[serde(default)]
pub zigzag: bool,
Expand All @@ -23,6 +25,11 @@ pub struct Field {
pub multiplier: u32,
#[serde(default)]
pub ftype: FieldType,
#[serde(default)]
pub number: std::string::String,



#[serde(default = "default_values_i", skip_serializing)]
pub values_i: usize,
}
Expand All @@ -33,6 +40,9 @@ fn default_missing_value() -> i32 {
/// Serde default for `Field::missing_string`: the literal `"MISSING"`.
fn default_missing_string() -> std::string::String {
    String::from("MISSING")
}
/// Serde default for `Field::description`: the literal `"added by echtvar"`.
fn default_description_string() -> std::string::String {
    String::from("added by echtvar")
}
/// Returns the default multiplier, `1`.
///
/// NOTE(review): presumably the serde default for `Field::multiplier`; the
/// attribute that references it is not visible in this excerpt.
fn default_multiplier() -> u32 {
    1_u32
}
Expand All @@ -47,9 +57,11 @@ impl Default for Field {
alias: "name".to_string(),
missing_value: -1,
missing_string: "MISSING".to_string(),
description: "added by echtvar".to_string(),
zigzag: false,
multiplier: 1,
ftype: FieldType::Integer,
number: ".".to_string(),
values_i: usize::MAX,
}
}
Expand Down
8 changes: 8 additions & 0 deletions tests/big.sh
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,10 @@ for mod in 2 3 4 5; do

# check that some variants remain unannotated
python3 check.py anno.vcf.gz $mod
# check custom INFO Description used from config
python3 check-string-for-issue33.py anno.vcf.gz aval 1 "added by echtvar TEST description field"
# check A/R/G converted to 1 for number
python3 check-string-for-issue33.py anno.vcf.gz external_AC 1 "added by echtvar Theoretical AC from another source"


if [[ "mod" -ne "1" ]]; then
Expand All @@ -24,6 +28,10 @@ for mod in 2 3 4 5; do
$echtvar encode test.echtvar1 test1.hjson generated-subset1.vcf
$echtvar anno generated-all.vcf -e test.echtvar0 -e test.echtvar1 anno.vcf.gz
python3 check.py anno.vcf.gz 1
# check default Description used
python3 check-string-for-issue33.py anno.vcf.gz aval1 1 "added by echtvar from test.echtvar1"
# check value . is left alone
python3 check-string-for-issue33.py anno.vcf.gz external_str . "added by echtvar from test.echtvar1"
fi


Expand Down
29 changes: 29 additions & 0 deletions tests/check-string-for-issue33.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
"""
Small test to ensure expected INFO fields generated

Usage: check-string-for-issue33.py <vcf.gz> <field> <number> <description>

Exits 0 when the ``##INFO`` header record for <field> carries the expected
Number and Description, 1 when the record is missing, and fails with an
AssertionError (non-zero exit) when the record is present but does not match.
"""
import sys
import gzip


def check_info_line(line, field, num, desc):
    """Check one header line against the expected INFO definition.

    Returns False when `line` is not the ``##INFO`` record for `field`, and
    True when it is and both Number and Description match.

    Raises AssertionError (explicitly, so the check still fires under
    ``python -O``) when the record is for `field` but Number or Description
    differ from what was expected.
    """
    if not line.startswith("##INFO=<ID={},".format(field)):
        return False
    # maxsplit=3 keeps the whole Description in toks[3] even when the
    # description text itself contains commas.
    toks = line.rstrip("\n").split(",", 3)
    if len(toks) < 4:
        raise AssertionError(f"Malformed INFO record for {field}: {line!r}")
    num_to_check = "Number={}".format(num)
    # exact comparison: a prefix match would accept Number=10 when 1 is expected
    if toks[1] != num_to_check:
        raise AssertionError(f"Expected {num_to_check} for {field}, got {toks[1]}")
    desc_to_check = "Description=\"{}\"".format(desc)
    if not toks[3].startswith(desc_to_check):
        raise AssertionError(f"Expected {desc_to_check} for {field}, got {toks[3]}")
    return True


def main(argv):
    """Scan the header of the gzipped VCF named in `argv` and return an exit code."""
    if len(argv) < 5:
        print(f"usage: {argv[0] if argv else 'check'} <vcf.gz> <field> <number> <description>",
              file=sys.stderr)
        return 2
    fpath, field, num, desc = argv[1:5]
    # context manager so the gzip handle is closed on every exit path
    with gzip.open(fpath, 'rt') as fh:
        for line in fh:
            if check_info_line(line, field, num, desc):
                return 0
            # only want to look through header
            if line.startswith("#CHROM"):
                break
    print(f"Expected field {field} not found in INFO", file=sys.stderr)
    return 1


if __name__ == "__main__":
    sys.exit(main(sys.argv))
9 changes: 5 additions & 4 deletions tests/make-vcf.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@
print(header % 1, file=subset1_fh)

nvar = 0

str_vals = ["YES", "NO", "MAYBE"]
for switch in [1, 2, 3, 4, 5, 1132, 1133, 1134]:
switch = switch<<20

Expand All @@ -38,13 +38,14 @@
for alen in range(0, 5):
for balt in itertools.permutations("ACGT", alen):
val = random.randint(0, 10000000)
ac = random.randint(1, 3)
alt = ref[0] + "".join(balt)
print(f"chr1\t{i}\t.\t{ref}\t{alt}\t1\tPASS\tval={val};nvar={nvar}", file=all_fh)
print(f"chr1\t{i}\t.\t{ref}\t{alt}\t1\tPASS\tval={val};nvar={nvar};AC={ac};str={str_vals[(ac-1)]}", file=all_fh)

if nvar % mod == 0:
print(f"chr1\t{i}\t.\t{ref}\t{alt}\t1\tPASS\tval0={val};nvar={nvar}", file=subset0_fh)
print(f"chr1\t{i}\t.\t{ref}\t{alt}\t1\tPASS\tval0={val};nvar={nvar};AC={ac};str={str_vals[(ac-1)]}", file=subset0_fh)
else:
print(f"chr1\t{i}\t.\t{ref}\t{alt}\t1\tPASS\tval1={val};nvar={nvar}", file=subset1_fh)
print(f"chr1\t{i}\t.\t{ref}\t{alt}\t1\tPASS\tval1={val};nvar={nvar};AC={ac};str={str_vals[(ac-1)]}", file=subset1_fh)
nvar += 1

for ref in ["ACCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC",
Expand Down
3 changes: 2 additions & 1 deletion tests/test0.hjson
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
[
{"field": "val0", "alias": "aval"}
{"field": "val0", "alias": "aval", "description": "TEST description field"},
{"field": "AC", "alias": "external_AC", "description": "Theoretical AC from another source"}
]
3 changes: 2 additions & 1 deletion tests/test1.hjson
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
[
{"field": "val1", "alias": "aval1"}
{"field": "val1", "alias": "aval1"},
{"field": "str", "alias": "external_str"}
]
Loading