From c9c5f8c8eea4a86eef197f4299ca25213300b363 Mon Sep 17 00:00:00 2001 From: jodyphelan Date: Mon, 11 Oct 2021 19:06:03 +0100 Subject: [PATCH 01/11] add tests --- hAMRonization/TBProfilerIO.py | 5 +- .../raw_outputs/tbprofiler/tbprofiler.json | 926 ++++++++++++++++++ test/dummy/tbprofiler/tbprofiler.json | 91 ++ test/run_test.sh | 3 + test/test_sanity.py | 45 + 5 files changed, 1068 insertions(+), 2 deletions(-) create mode 100644 test/data/raw_outputs/tbprofiler/tbprofiler.json create mode 100644 test/dummy/tbprofiler/tbprofiler.json diff --git a/hAMRonization/TBProfilerIO.py b/hAMRonization/TBProfilerIO.py index b009202..ced6e7e 100644 --- a/hAMRonization/TBProfilerIO.py +++ b/hAMRonization/TBProfilerIO.py @@ -22,8 +22,8 @@ def __init__(self, source, metadata): 'frequency': 'variant_frequency', 'db_name': 'reference_database_id', 'db_version': 'reference_database_version', + 'software_name': 'analysis_software_version', 'tbprofiler_version': 'analysis_software_version' - } super().__init__(source, self.field_mapping, self.metadata) @@ -45,7 +45,8 @@ def parse(self, handle): 'frequency': variant['freq'], 'db_name': json_obj['db_version']['name'], 'db_version': json_obj['db_version']['commit'], - 'tbprofiler_version': json_obj['tbprofiler_version'] + 'tbprofiler_version': json_obj['tbprofiler_version'], + 'software_name': 'tb-profiler' } yield self.hAMRonize(result, self.metadata) diff --git a/test/data/raw_outputs/tbprofiler/tbprofiler.json b/test/data/raw_outputs/tbprofiler/tbprofiler.json new file mode 100644 index 0000000..b8cead3 --- /dev/null +++ b/test/data/raw_outputs/tbprofiler/tbprofiler.json @@ -0,0 +1,926 @@ +{ + "qc": { + "pct_reads_mapped": 97.16, + "num_reads_mapped": 4232866, + "median_coverage": 121, + "gene_coverage": [ + { + "fraction": 0, + "cutoff": 0, + "locus_tag": "Rv0005", + "gene": "gyrB" + }, + { + "fraction": 0, + "cutoff": 0, + "locus_tag": "Rv0006", + "gene": "gyrA" + }, + { + "fraction": 0, + "cutoff": 0, + "locus_tag": "Rv0407", + "gene": "fgd1" + }, + { + "fraction": 0, + "cutoff": 0, + "locus_tag": "Rv0486", + "gene": "mshA" + }, + { + "fraction": 0, + "cutoff": 0, + "locus_tag": "Rv0667", + "gene": "rpoB" + }, + { + "fraction": 0, + "cutoff": 0, + "locus_tag": "Rv0668", + "gene": "rpoC" + }, + { + "fraction": 0, + "cutoff": 0, + "locus_tag": "Rv0678", + "gene": "mmpR5" + }, + { + "fraction": 0, + "cutoff": 0, + "locus_tag": "Rv0682", + "gene": "rpsL" + }, + { + "fraction": 0, + "cutoff": 0, + "locus_tag": "Rv0701", + "gene": "rplC" + }, + { + "fraction": 0, + "cutoff": 0, + "locus_tag": "Rv1173", + "gene": "fbiC" + }, + { + "fraction": 0, + "cutoff": 0, + "locus_tag": "Rv1267c", + "gene": "embR" + }, + { + "fraction": 0, + "cutoff": 0, + "locus_tag": "Rv1305", + "gene": "atpE" + }, + { + "fraction": 0, + "cutoff": 0, + "locus_tag": "EBG00000313325", + "gene": "rrs" + }, + { + "fraction": 0, + "cutoff": 0, + "locus_tag": "EBG00000313339", + "gene": "rrl" + }, + { + "fraction": 0, + "cutoff": 0, + "locus_tag": "Rv1483", + "gene": "fabG1" + }, + { + "fraction": 0, + "cutoff": 0, + "locus_tag": "Rv1484", + "gene": "inhA" + }, + { + "fraction": 0, + "cutoff": 0, + "locus_tag": "Rv1630", + "gene": "rpsA" + }, + { + "fraction": 0, + "cutoff": 0, + "locus_tag": "Rv1694", + "gene": "tlyA" + }, + { + "fraction": 0, + "cutoff": 0, + "locus_tag": "Rv1908c", + "gene": "katG" + }, + { + "fraction": 0, + "cutoff": 0, + "locus_tag": "Rv2043c", + "gene": "pncA" + }, + { + "fraction": 0, + "cutoff": 0, + "locus_tag": "Rv2245", + "gene": "kasA" + }, + { + "fraction": 0, + "cutoff": 0, + "locus_tag": "Rv2416c", + "gene": "eis" + }, + { + "fraction": 0, + "cutoff": 0, + "locus_tag": "Rv2428", + "gene": "ahpC" + }, + { + "fraction": 0, + "cutoff": 0, + "locus_tag": "Rv2447c", + "gene": "folC" + }, + { + "fraction": 0, + "cutoff": 0, + "locus_tag": "Rv2535c", + "gene": "pepQ" + }, + { + "fraction": 0, + "cutoff": 0, + "locus_tag": "Rv2671", + "gene": "ribD" + }, + { + "fraction": 0, + "cutoff": 0, + "locus_tag": "Rv2754c", + "gene": "thyX" + }, + { + "fraction": 0, + "cutoff": 0, + "locus_tag": "Rv2764c", + "gene": "thyA" + }, + { + "fraction": 0, + "cutoff": 0, + "locus_tag": "Rv2780", + "gene": "ald" + }, + { + "fraction": 0, + "cutoff": 0, + "locus_tag": "Rv2983", + "gene": "fbiD" + }, + { + "fraction": 0, + "cutoff": 0, + "locus_tag": "Rv3261", + "gene": "fbiA" + }, + { + "fraction": 0, + "cutoff": 0, + "locus_tag": "Rv3262", + "gene": "fbiB" + }, + { + "fraction": 0, + "cutoff": 0, + "locus_tag": "Rv3423c", + "gene": "alr" + }, + { + "fraction": 0, + "cutoff": 0, + "locus_tag": "Rv3547", + "gene": "ddn" + }, + { + "fraction": 0, + "cutoff": 0, + "locus_tag": "Rv3601c", + "gene": "panD" + }, + { + "fraction": 0, + "cutoff": 0, + "locus_tag": "Rv3793", + "gene": "embC" + }, + { + "fraction": 0, + "cutoff": 0, + "locus_tag": "Rv3794", + "gene": "embA" + }, + { + "fraction": 0, + "cutoff": 0, + "locus_tag": "Rv3795", + "gene": "embB" + }, + { + "fraction": 0, + "cutoff": 0, + "locus_tag": "Rv3806c", + "gene": "ubiA" + }, + { + "fraction": 0, + "cutoff": 0, + "locus_tag": "Rv3854c", + "gene": "ethA" + }, + { + "fraction": 0, + "cutoff": 0, + "locus_tag": "Rv3855", + "gene": "ethR" + }, + { + "fraction": 0, + "cutoff": 0, + "locus_tag": "Rv3919c", + "gene": "gid" + } + ], + "missing_positions": [] + }, + "delly": "success", + "lineage": [ + { + "lin": "lineage3", + "family": "East-African-Indian", + "spoligotype": "CAS", + "rd": "RD750", + "frac": 0.4586267605633803 + }, + { + "lin": "lineage2", + "family": "East-Asian", + "spoligotype": "Beijing", + "rd": "RD105", + "frac": 0.5208510638297872 + }, + { + "lin": "lineage2.2", + "family": "East-Asian (Beijing)", + "spoligotype": "Beijing-RD207", + "rd": "RD105;RD207", + "frac": 0.5471512770137524 + }, + { + "lin": "lineage2.2.1", + "family": "East-Asian (Beijing)", + "spoligotype": "Beijing-RD181", + "rd": "RD105;RD207;RD181", + "frac": 0.5304114490161002 + } + ], + "main_lin": "lineage2;lineage3", + "sublin": "lineage3;lineage2.2.1", + "dr_variants": [ + { + "chrom": "Chromosome", + "genome_pos": 6750, + "ref": "C", + "alt": "T", + "freq": 0.515625, + "feature_id": "CCP42727", + "type": "missense_variant", + "nucleotide_change": "c.1511C>T", + "protein_change": "p.Ala504Val", + "alternate_consequences": [], + "change": "p.Ala504Val", + "locus_tag": "Rv0005", + "gene": "gyrB", + "drugs": [ + { + "type": "drug", + "drug": "ciprofloxacin", + "confidence": "indeterminate" + }, + { + "type": "drug", + "drug": "fluoroquinolones", + "confidence": "indeterminate" + }, + { + "type": "drug", + "drug": "levofloxacin", + "confidence": "indeterminate" + }, + { + "type": "drug", + "drug": "moxifloxacin", + "confidence": "indeterminate" + }, + { + "type": "drug", + "drug": "ofloxacin", + "confidence": "indeterminate" + } + ] + }, + { + "chrom": "Chromosome", + "genome_pos": 7570, + "ref": "C", + "alt": "T", + "freq": 0.5472972972972973, + "feature_id": "CCP42728", + "type": "missense_variant", + "nucleotide_change": "c.269C>T", + "protein_change": "p.Ala90Val", + "alternate_consequences": [], + "change": "p.Ala90Val", + "locus_tag": "Rv0006", + "gene": "gyrA", + "drugs": [ + { + "type": "drug", + "drug": "ciprofloxacin", + "confidence": "high" + }, + { + "type": "drug", + "drug": "fluoroquinolones", + "confidence": "indeterminate" + }, + { + "type": "drug", + "drug": "levofloxacin", + "confidence": "indeterminate" + }, + { + "type": "drug", + "drug": "moxifloxacin", + "confidence": "moderate" + }, + { + "type": "drug", + "drug": "ofloxacin", + "confidence": "high" + } + ] + }, + { + "chrom": "Chromosome", + "genome_pos": 761155, + "ref": "C", + "alt": "T", + "freq": 0.5754716981132075, + "feature_id": "CCP43410", + "type": "missense_variant", + "nucleotide_change": "c.1349C>T", + "protein_change": "p.Ser450Leu", + "alternate_consequences": [], + "change": "p.Ser450Leu", + "locus_tag": "Rv0667", + "gene": "rpoB", + "drugs": [ + { + "type": "drug", + "drug": "rifampicin", + "confidence": "high", + "literature": "10.1128/AAC.01093-18" + } + ] + }, + { + "chrom": "Chromosome", + "genome_pos": 1472362, + "ref": "C", + "alt": "T", + "freq": 0.5875, + "feature_id": "EBG00000313325-1", + "type": "non_coding_transcript_exon_variant", + "nucleotide_change": "n.517C>T", + "protein_change": "", + "alternate_consequences": [], + "change": "n.517C>T", + "locus_tag": "EBG00000313325", + "gene": "rrs", + "drugs": [ + { + "type": "drug", + "drug": "streptomycin", + "confidence": "indeterminate" + } + ] + }, + { + "chrom": "Chromosome", + "genome_pos": 1473246, + "ref": "A", + "alt": "G", + "freq": 0.5617977528089888, + "feature_id": "EBG00000313325-1", + "type": "non_coding_transcript_exon_variant", + "nucleotide_change": "n.1401A>G", + "protein_change": "", + "alternate_consequences": [], + "change": "n.1401A>G", + "locus_tag": "EBG00000313325", + "gene": "rrs", + "drugs": [ + { + "type": "drug", + "drug": "amikacin", + "confidence": "indeterminate" + }, + { + "type": "drug", + "drug": "aminoglycosides", + "confidence": "indeterminate" + }, + { + "type": "drug", + "drug": "capreomycin", + "confidence": "indeterminate" + }, + { + "type": "drug", + "drug": "kanamycin", + "confidence": "indeterminate" + } + ] + }, + { + "chrom": "Chromosome", + "genome_pos": 2155168, + "ref": "C", + "alt": "G", + "freq": 0.47674418604651164, + "feature_id": "CCP44675", + "type": "missense_variant", + "nucleotide_change": "c.944G>C", + "protein_change": "p.Ser315Thr", + "alternate_consequences": [], + "change": "p.Ser315Thr", + "locus_tag": "Rv1908c", + "gene": "katG", + "drugs": [ + { + "type": "drug", + "drug": "isoniazid", + "confidence": "high" + } + ] + }, + { + "chrom": "Chromosome", + "genome_pos": 2289031, + "ref": "G", + "alt": "A", + "freq": 0.4520547945205479, + "feature_id": "CCP44816", + "type": "missense_variant", + "nucleotide_change": "c.211C>T", + "protein_change": "p.His71Tyr", + "alternate_consequences": [], + "change": "p.His71Tyr", + "locus_tag": "Rv2043c", + "gene": "pncA", + "drugs": [ + { + "type": "drug", + "drug": "pyrazinamide", + "confidence": "high" + } + ] + }, + { + "chrom": "Chromosome", + "genome_pos": 4247574, + "ref": "A", + "alt": "C", + "freq": 0.6280487804878049, + "feature_id": "CCP46624", + "type": "missense_variant", + "nucleotide_change": "c.1061A>C", + "protein_change": "p.Asp354Ala", + "alternate_consequences": [], + "change": "p.Asp354Ala", + "locus_tag": "Rv3795", + "gene": "embB", + "drugs": [ + { + "type": "drug", + "drug": "ethambutol", + "confidence": "moderate" + } + ] + } + ], + "other_variants": [ + { + "chrom": "Chromosome", + "genome_pos": 7362, + "ref": "G", + "alt": "C", + "freq": 1, + "feature_id": "CCP42728", + "type": "missense_variant", + "nucleotide_change": "c.61G>C", + "protein_change": "p.Glu21Gln", + "alternate_consequences": [], + "change": "p.Glu21Gln", + "locus_tag": "Rv0006", + "gene": "gyrA", + "gene_associated_drugs": [ + "fluoroquinolones", + "ciprofloxacin", + "ofloxacin", + "levofloxacin", + "moxifloxacin" + ] + }, + { + "chrom": "Chromosome", + "genome_pos": 7585, + "ref": "G", + "alt": "C", + "freq": 1, + "feature_id": "CCP42728", + "type": "missense_variant", + "nucleotide_change": "c.284G>C", + "protein_change": "p.Ser95Thr", + "alternate_consequences": [], + "change": "p.Ser95Thr", + "locus_tag": "Rv0006", + "gene": "gyrA", + "gene_associated_drugs": [ + "fluoroquinolones", + "ciprofloxacin", + "ofloxacin", + "levofloxacin", + "moxifloxacin" + ] + }, + { + "chrom": "Chromosome", + "genome_pos": 9304, + "ref": "G", + "alt": "A", + "freq": 1, + "feature_id": "CCP42728", + "type": "missense_variant", + "nucleotide_change": "c.2003G>A", + "protein_change": "p.Gly668Asp", + "alternate_consequences": [], + "change": "p.Gly668Asp", + "locus_tag": "Rv0006", + "gene": "gyrA", + "gene_associated_drugs": [ + "fluoroquinolones", + "ciprofloxacin", + "ofloxacin", + "levofloxacin", + "moxifloxacin" + ] + }, + { + "chrom": "Chromosome", + "genome_pos": 491742, + "ref": "T", + "alt": "C", + "freq": 1, + "feature_id": "CCP43138", + "type": "synonymous_variant", + "nucleotide_change": "c.960T>C", + "protein_change": "p.Phe320Phe", + "alternate_consequences": [], + "change": "c.960T>C", + "locus_tag": "Rv0407", + "gene": "fgd1", + "gene_associated_drugs": [ + "delamanid" + ] + }, + { + "chrom": "Chromosome", + "genome_pos": 575907, + "ref": "C", + "alt": "T", + "freq": 0.5153374233128835, + "feature_id": "CCP43220", + "type": "missense_variant", + "nucleotide_change": "c.560C>T", + "protein_change": "p.Ala187Val", + "alternate_consequences": [], + "change": "p.Ala187Val", + "locus_tag": "Rv0486", + "gene": "mshA", + "gene_associated_drugs": [ + "ethionamide" + ] + }, + { + "chrom": "Chromosome", + "genome_pos": 576108, + "ref": "C", + "alt": "G", + "freq": 0.2608695652173913, + "feature_id": "CCP43220", + "type": "missense_variant", + "nucleotide_change": "c.761C>G", + "protein_change": "p.Ala254Gly", + "alternate_consequences": [], + "change": "p.Ala254Gly", + "locus_tag": "Rv0486", + "gene": "mshA", + "gene_associated_drugs": [ + "ethionamide" + ] + }, + { + "chrom": "Chromosome", + "genome_pos": 762434, + "ref": "T", + "alt": "G", + "freq": 0.5086206896551724, + "feature_id": "CCP43410", + "type": "synonymous_variant", + "nucleotide_change": "c.2628T>G", + "protein_change": "p.Gly876Gly", + "alternate_consequences": [], + "change": "c.2628T>G", + "locus_tag": "Rv0667", + "gene": "rpoB", + "gene_associated_drugs": [ + "rifampicin" + ] + }, + { + "chrom": "Chromosome", + "genome_pos": 763031, + "ref": "T", + "alt": "C", + "freq": 1, + "feature_id": "CCP43410", + "type": "synonymous_variant", + "nucleotide_change": "c.3225T>C", + "protein_change": "p.Ala1075Ala", + "alternate_consequences": [], + "change": "c.3225T>C", + "locus_tag": "Rv0667", + "gene": "rpoB", + "gene_associated_drugs": [ + "rifampicin" + ] + }, + { + "chrom": "Chromosome", + "genome_pos": 765846, + "ref": "A", + "alt": "C", + "freq": 0.4755244755244755, + "feature_id": "CCP43411", + "type": "missense_variant", + "nucleotide_change": "c.2477A>C", + "protein_change": "p.Asn826Thr", + "alternate_consequences": [], + "change": "p.Asn826Thr", + "locus_tag": "Rv0668", + "gene": "rpoC", + "gene_associated_drugs": [ + "rifampicin" + ] + }, + { + "chrom": "Chromosome", + "genome_pos": 766645, + "ref": "A", + "alt": "C", + "freq": 0.4566929133858268, + "feature_id": "CCP43411", + "type": "missense_variant", + "nucleotide_change": "c.3276A>C", + "protein_change": "p.Glu1092Asp", + "alternate_consequences": [], + "change": "p.Glu1092Asp", + "locus_tag": "Rv0668", + "gene": "rpoC", + "gene_associated_drugs": [ + "rifampicin" + ] + }, + { + "chrom": "Chromosome", + "genome_pos": 1834177, + "ref": "A", + "alt": "C", + "freq": 0.5540540540540541, + "feature_id": "CCP44394", + "type": "synonymous_variant", + "nucleotide_change": "c.636A>C", + "protein_change": "p.Arg212Arg", + "alternate_consequences": [], + "change": "c.636A>C", + "locus_tag": "Rv1630", + "gene": "rpsA", + "gene_associated_drugs": [ + "pyrazinamide" + ] + }, + { + "chrom": "Chromosome", + "genome_pos": 2154724, + "ref": "C", + "alt": "A", + "freq": 1, + "feature_id": "CCP44675", + "type": "missense_variant", + "nucleotide_change": "c.1388G>T", + "protein_change": "p.Arg463Leu", + "alternate_consequences": [], + "change": "p.Arg463Leu", + "locus_tag": "Rv1908c", + "gene": "katG", + "gene_associated_drugs": [ + "isoniazid" + ] + }, + { + "chrom": "Chromosome", + "genome_pos": 2289047, + "ref": "G", + "alt": "A", + "freq": 0.5294117647058824, + "feature_id": "CCP44816", + "type": "synonymous_variant", + "nucleotide_change": "c.195C>T", + "protein_change": "p.Ser65Ser", + "alternate_consequences": [], + "change": "c.195C>T", + "locus_tag": "Rv2043c", + "gene": "pncA", + "gene_associated_drugs": [ + "pyrazinamide" + ] + }, + { + "chrom": "Chromosome", + "genome_pos": 2289365, + "ref": "CG", + "alt": "C", + "freq": 0.5617977528089888, + "feature_id": "CCP44816", + "type": "upstream_gene_variant", + "nucleotide_change": "c.-125delC", + "protein_change": "", + "alternate_consequences": [], + "change": "c.-125delC", + "locus_tag": "Rv2043c", + "gene": "pncA", + "gene_associated_drugs": [ + "pyrazinamide" + ] + }, + { + "chrom": "Chromosome", + "genome_pos": 4242075, + "ref": "G", + "alt": "A", + "freq": 0.5904761904761905, + "feature_id": "CCP46622", + "type": "missense_variant", + "nucleotide_change": "c.2213G>A", + "protein_change": "p.Arg738Gln", + "alternate_consequences": [], + "change": "p.Arg738Gln", + "locus_tag": "Rv3793", + "gene": "embC", + "gene_associated_drugs": [ + "ethambutol" + ] + }, + { + "chrom": "Chromosome", + "genome_pos": 4242643, + "ref": "C", + "alt": "T", + "freq": 1, + "feature_id": "CCP46622", + "type": "synonymous_variant", + "nucleotide_change": "c.2781C>T", + "protein_change": "p.Arg927Arg", + "alternate_consequences": [], + "change": "c.2781C>T", + "locus_tag": "Rv3793", + "gene": "embC", + "gene_associated_drugs": [ + "ethambutol" + ] + }, + { + "chrom": "Chromosome", + "genome_pos": 4243460, + "ref": "C", + "alt": "T", + "freq": 0.4484848484848485, + "feature_id": "CCP46623", + "type": "synonymous_variant", + "nucleotide_change": "c.228C>T", + "protein_change": "p.Cys76Cys", + "alternate_consequences": [], + "change": "c.228C>T", + "locus_tag": "Rv3794", + "gene": "embA", + "gene_associated_drugs": [ + "ethambutol" + ] + }, + { + "chrom": "Chromosome", + "genome_pos": 4245113, + "ref": "C", + "alt": "T", + "freq": 0.4107142857142857, + "feature_id": "CCP46623", + "type": "synonymous_variant", + "nucleotide_change": "c.1881C>T", + "protein_change": "p.Val627Val", + "alternate_consequences": [], + "change": "c.1881C>T", + "locus_tag": "Rv3794", + "gene": "embA", + "gene_associated_drugs": [ + "ethambutol" + ] + }, + { + "chrom": "Chromosome", + "genome_pos": 4327480, + "ref": "A", + "alt": "G", + "freq": 0.5813953488372093, + "feature_id": "CCP46683", + "type": "upstream_gene_variant", + "nucleotide_change": "c.-7T>C", + "protein_change": "", + "alternate_consequences": [ + { + "gene_name": "ethR", + "gene_id": "Rv3855", + "feature_id": "CCP46684", + "type": "upstream_gene_variant", + "nucleotide_change": "c.-69A>G", + "protein_change": "" + } + ], + "change": "c.-7T>C", + "locus_tag": "Rv3854c", + "gene": "ethA", + "gene_associated_drugs": [ + "ethionamide" + ] + }, + { + "chrom": "Chromosome", + "genome_pos": 4407927, + "ref": "T", + "alt": "G", + "freq": 0.5190839694656488, + "feature_id": "CCP46748", + "type": "missense_variant", + "nucleotide_change": "c.276A>C", + "protein_change": "p.Glu92Asp", + "alternate_consequences": [], + "change": "p.Glu92Asp", + "locus_tag": "Rv3919c", + "gene": "gid", + "gene_associated_drugs": [ + "streptomycin" + ] + } + ], + "drtype": "Pre-XDR", + "db_version": { + "name": "tbdb", + "commit": "a800e0a", + "Author": "jodyphelan ", + "Date": "Mon Sep 27 08:31:28 2021 +0100" + }, + "id": "SRR6916544", + "tbprofiler_version": "3.0.8", + "pipeline": { + "mapper": "bwa", + "variant_caller": "bcftools" + }, + "timestamp": "11-10-2021 16:43:40" +} diff --git a/test/dummy/tbprofiler/tbprofiler.json b/test/dummy/tbprofiler/tbprofiler.json new file mode 100644 index 0000000..3cdb9fb --- /dev/null +++ b/test/dummy/tbprofiler/tbprofiler.json @@ -0,0 +1,91 @@ +{ + "qc": { + "pct_reads_mapped": 97.16, + "num_reads_mapped": 4232866, + "median_coverage": 121, + "gene_coverage": [ + { + "fraction": 0, + "cutoff": 0, + "locus_tag": "Rv3919c", + "gene": "gid" + } + ], + "missing_positions": [] + }, + "delly": "success", + "lineage": [ + { + "lin": "lineage3", + "family": "East-African-Indian", + "spoligotype": "CAS", + "rd": "RD750", + "frac": 0.4586267605633803 + }, + { + "lin": "lineage2", + "family": "East-Asian", + "spoligotype": "Beijing", + "rd": "RD105", + "frac": 0.5208510638297872 + }, + { + "lin": "lineage2.2", + "family": "East-Asian (Beijing)", + "spoligotype": "Beijing-RD207", + "rd": "RD105;RD207", + "frac": 0.5471512770137524 + }, + { + "lin": "lineage2.2.1", + "family": "East-Asian (Beijing)", + "spoligotype": "Beijing-RD181", + "rd": "RD105;RD207;RD181", + "frac": 0.5304114490161002 + } + ], + "main_lin": "lineage2;lineage3", + "sublin": "lineage3;lineage2.2.1", + "dr_variants": [ + { + "chrom": "Chromosome", + "genome_pos": 761155, + "ref": "C", + "alt": "T", + "freq": 0.5754716981132075, + "feature_id": "CCP43410", + "type": "missense_variant", + "nucleotide_change": "c.1349C>T", + "protein_change": "p.Ser450Leu", + "alternate_consequences": [], + "change": "p.Ser450Leu", + "locus_tag": "Rv0667", + "gene": "rpoB", + "drugs": [ + { + "type": "drug", + "drug": "rifampicin", + "confidence": "high", + "literature": "10.1128/AAC.01093-18" + } + ] + } + ], + "other_variants": [ + + ], + "drtype": "Pre-XDR", + "db_version": { + "name": "tbdb", + "commit": "a800e0a", + "Author": "jodyphelan ", + "Date": "Mon Sep 27 08:31:28 2021 +0100" + }, + "id": "SRR6916544", + "tbprofiler_version": "3.0.8", + "pipeline": { + "mapper": "bwa", + "variant_caller": "bcftools" + }, + "timestamp": "11-10-2021 16:43:40" +} diff --git a/test/run_test.sh b/test/run_test.sh index a9bd6d3..ab902a0 100644 --- a/test/run_test.sh +++ b/test/run_test.sh @@ -1,6 +1,9 @@ #!/bin/bash set -e + +hamronize tb-profiler data/raw_outputs/tbprofiler/tbprofiler.json --format json --output hamronized_tbprofiler.json + hamronize abricate data/raw_outputs/abricate/report.tsv --reference_database_version db_v_1 --analysis_software_version tool_v_1 --format json --output hamronized_abricate.json hamronize abricate data/raw_outputs/abricate/report.tsv --reference_database_version db_v_1 --analysis_software_version tool_v_1 --format tsv --output hamronized_abricate.tsv diff --git a/test/test_sanity.py b/test/test_sanity.py index 75e597e..46e3474 100644 --- a/test/test_sanity.py +++ b/test/test_sanity.py @@ -550,3 +550,48 @@ def test_csstar(): assert result.reference_protein_stop is None assert result.reference_gene_start is None assert result.reference_gene_stop is None + + + +def test_tbprofiler(): + metadata = {"input_file_name": "Dummy"} + parsed_report = hAMRonization.parse("dummy/tbprofiler/tbprofiler.json", metadata, "csstar") + + for result in parsed_report: + # assert mandatory fields + assert result.input_file_name == 'Dummy' + assert result.gene_symbol == 'rpoB' + assert result.gene_name == 'rpoB' + assert result.reference_database_id == 'tbdb' + assert result.reference_database_version == 'a800e0a' + assert result.analysis_software_name == 'tb-profiler' + assert result.analysis_software_version == '3.0.8' + assert result.genetic_variation_type == 'protein_variant' + + # optional fields - present in dummy dataset + assert result.drug_class == 'rifampicin' + + # mandatory but missing + assert result.reference_accession is None + assert result.sequence_identity is None + + # missing data in report + assert result.reference_gene_length is None + assert result.input_gene_length is None + assert result.input_sequence_id is None + assert result.coverage_percentage is None + assert result.coverage_depth is None + assert result.input_gene_start is None + assert result.input_gene_stop is None + assert result.strand_orientation is None + assert result.antimicrobial_agent is None + assert result.reference_protein_length is None + assert result.coverage_ratio is None + assert result.input_protein_length is None + assert result.resistance_mechanism is None + assert result.input_protein_start is None + assert result.input_protein_stop is None + assert result.reference_protein_start is None + assert result.reference_protein_stop is None + assert result.reference_gene_start is None + assert result.reference_gene_stop is None From beb06f56ae36ea7ccc386f3712d580327ba5019a Mon Sep 17 00:00:00 2001 From: jodyphelan Date: Mon, 11 Oct 2021 19:09:24 +0100 Subject: [PATCH 02/11] fix typo --- test/test_sanity.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/test_sanity.py b/test/test_sanity.py index 46e3474..839ff24 100644 --- a/test/test_sanity.py +++ b/test/test_sanity.py @@ -555,7 +555,7 @@ def test_csstar(): def test_tbprofiler(): metadata = {"input_file_name": "Dummy"} - parsed_report = hAMRonization.parse("dummy/tbprofiler/tbprofiler.json", metadata, "csstar") + parsed_report = hAMRonization.parse("dummy/tbprofiler/tbprofiler.json", metadata, "tbprofiler") for result in parsed_report: # assert mandatory fields From 66aa36509e321edcafe3f8af4a09314ec5f359dd Mon Sep 17 00:00:00 2001 From: jodyphelan Date: Mon, 11 Oct 2021 19:17:27 +0100 Subject: [PATCH 03/11] add tbprofiler to init --- hAMRonization/__init__.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/hAMRonization/__init__.py b/hAMRonization/__init__.py index f1a6750..f8a3fc6 100644 --- a/hAMRonization/__init__.py +++ b/hAMRonization/__init__.py @@ -17,7 +17,7 @@ from hAMRonization import CSStarIO from hAMRonization import AmrPlusPlusIO from hAMRonization import ResFamsIO - +from hAMRonization import TBProfilerIO _FormatToIterator = { "abricate": AbricateIO.AbricateIterator, @@ -34,7 +34,8 @@ "staramr": StarAmrIO.StarAmrIterator, "csstar": CSStarIO.CSStarIterator, "amrplusplus": AmrPlusPlusIO.AmrPlusPlusIterator, - "resfams": ResFamsIO.ResFamsIterator + "resfams": ResFamsIO.ResFamsIterator, + "tbprofiler": TBProfilerIO.TBProfilerIterator } _ReportFileToUse = { @@ -52,7 +53,8 @@ "staramr": "resfinder.tsv", "csstar": "OUTPUT.tsv", "amrplusplus": "gene.tsv", - "resfams": "resfams.tblout" + "resfams": "resfams.tblout", + "tbprofiler": "OUTPUT.results.json" } @@ -72,6 +74,7 @@ "amrplusplus": AmrPlusPlusIO.required_metadata, "resfams": ResFamsIO.required_metadata, "groot": GrootIO.required_metadata, + "tbprofiler": TBProfilerIO.required_metadata, } From 987be51f6a59ddfcc03df830b273ac8ec4b497c2 Mon Sep 17 00:00:00 2001 From: jodyphelan Date: Mon, 11 Oct 2021 19:26:56 +0100 Subject: [PATCH 04/11] set required metadata --- hAMRonization/TBProfilerIO.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/hAMRonization/TBProfilerIO.py b/hAMRonization/TBProfilerIO.py index ced6e7e..b6e7668 100644 --- a/hAMRonization/TBProfilerIO.py +++ b/hAMRonization/TBProfilerIO.py @@ -3,8 +3,7 @@ import json from .Interfaces import hAMRonizedResultIterator -required_metadata = ['analysis_software_version', - 'reference_database_version'] +required_metadata = [] class TBProfilerIterator(hAMRonizedResultIterator): From 51ce299918166e005e39aac39d68325d36def38b Mon Sep 17 00:00:00 2001 From: jodyphelan Date: Mon, 11 Oct 2021 19:39:35 +0100 Subject: [PATCH 05/11] add in new fields --- hAMRonization/TBProfilerIO.py | 6 ++++-- hAMRonization/hAMRonizedResult.py | 6 +++++- test/test_sanity.py | 2 +- 3 files changed, 10 insertions(+), 4 deletions(-) diff --git a/hAMRonization/TBProfilerIO.py b/hAMRonization/TBProfilerIO.py index b6e7668..2e00313 100644 --- a/hAMRonization/TBProfilerIO.py +++ b/hAMRonization/TBProfilerIO.py @@ -22,7 +22,8 @@ def __init__(self, source, metadata): 'db_name': 'reference_database_id', 'db_version': 'reference_database_version', 'software_name': 'analysis_software_version', - 'tbprofiler_version': 'analysis_software_version' + 'tbprofiler_version': 'analysis_software_version', + 'reference_accession': 'reference_accession' } super().__init__(source, self.field_mapping, self.metadata) @@ -45,7 +46,8 @@ def parse(self, handle): 'db_name': json_obj['db_version']['name'], 'db_version': json_obj['db_version']['commit'], 'tbprofiler_version': json_obj['tbprofiler_version'], - 'software_name': 'tb-profiler' + 'software_name': 'tb-profiler', + 'reference_accession': variant['feature_id'] } yield self.hAMRonize(result, self.metadata) diff --git a/hAMRonization/hAMRonizedResult.py b/hAMRonization/hAMRonizedResult.py index 97a0674..ba3cfd1 100644 --- a/hAMRonization/hAMRonizedResult.py +++ b/hAMRonization/hAMRonizedResult.py @@ -19,7 +19,11 @@ class hAMRonizedResult(): reference_accession: str analysis_software_name: str analysis_software_version: str - + + # variant specific optional fields + variant_frequency: float + genetic_variation_type: str + # optional fields sequence_identity: float = None input_sequence_id: str = None diff --git a/test/test_sanity.py b/test/test_sanity.py index 839ff24..8719094 100644 --- a/test/test_sanity.py +++ b/test/test_sanity.py @@ -567,12 +567,12 @@ def test_tbprofiler(): assert result.analysis_software_name == 'tb-profiler' assert result.analysis_software_version == '3.0.8' assert result.genetic_variation_type == 'protein_variant' + assert result.reference_accession == 'CCP43410' # optional fields - present in dummy dataset assert result.drug_class == 'rifampicin' # mandatory but missing - assert result.reference_accession is None assert result.sequence_identity is None # missing data in report From 58ab313c9502e48f3accbae9cb01c62299ffcb5d Mon Sep 17 00:00:00 2001 From: jodyphelan Date: Mon, 11 Oct 2021 19:44:33 +0100 Subject: [PATCH 06/11] add in new fields --- test/dummy/tbprofiler/tbprofiler.json | 2 +- test/test_sanity.py | 44 ++++++++++++++++++--------- 2 files changed, 30 insertions(+), 16 deletions(-) diff --git a/test/dummy/tbprofiler/tbprofiler.json b/test/dummy/tbprofiler/tbprofiler.json index 3cdb9fb..e2732a5 100644 --- a/test/dummy/tbprofiler/tbprofiler.json +++ b/test/dummy/tbprofiler/tbprofiler.json @@ -52,7 +52,7 @@ "genome_pos": 761155, "ref": "C", "alt": "T", - "freq": 0.5754716981132075, + "freq": 1, "feature_id": "CCP43410", "type": "missense_variant", "nucleotide_change": "c.1349C>T", diff --git a/test/test_sanity.py b/test/test_sanity.py index 8719094..bad9048 100644 --- a/test/test_sanity.py +++ b/test/test_sanity.py @@ -50,7 +50,8 @@ def test_abricate(): assert result.reference_protein_stop is None assert result.reference_gene_start is None assert result.reference_gene_stop is None - + assert result.variant_frequency is None + assert result.genetic_variation_type is None def test_amrfinder(): metadata = {"analysis_software_version": "3.6.10", "reference_database_version": "2019-Jul-28", @@ -92,7 +93,8 @@ def test_amrfinder(): assert result.reference_protein_stop is None assert result.reference_gene_start is None assert result.reference_gene_stop is None - + assert result.variant_frequency is None + assert result.genetic_variation_type is None def test_amrplusplus(): metadata = {"analysis_software_version": "0.0.1", "reference_database_version": "2019-Jul-28", @@ -134,7 +136,8 @@ def test_amrplusplus(): assert result.reference_protein_stop is None assert result.reference_gene_start is None assert result.reference_gene_stop is None - + assert result.variant_frequency is None + assert result.genetic_variation_type is None def test_ariba(): metadata = {"analysis_software_version": "0.0.1", "reference_database_version": "2019-Jul-28", @@ -176,7 +179,8 @@ def test_ariba(): assert result.reference_protein_stop is None assert result.reference_gene_start is None assert result.reference_gene_stop is None - + assert result.variant_frequency is None + assert result.genetic_variation_type is None def test_kmerresistance(): metadata = {"analysis_software_version": "0.0.1", "reference_database_version": "2019-Jul-28", @@ -218,7 +222,8 @@ def test_kmerresistance(): assert result.reference_protein_stop is None assert result.reference_gene_start is None assert result.reference_gene_stop is None - + assert result.variant_frequency is None + assert result.genetic_variation_type is None def test_resfinder(): metadata = {"analysis_software_version": "0.0.1", "reference_database_version": "2019-Jul-28"} @@ -259,6 +264,8 @@ def test_resfinder(): assert result.reference_protein_stop is None assert result.reference_gene_start is None assert result.reference_gene_stop is None + assert result.variant_frequency is None + assert result.genetic_variation_type is None def test_resfinder4(): metadata = {"analysis_software_version": "0.0.1", "reference_database_version": "2019-Jul-28", "input_file_name": "Dummy"} @@ -299,6 +306,9 @@ def test_resfinder4(): assert result.reference_protein_stop is None assert result.reference_gene_start is None assert result.reference_gene_stop is None + assert result.variant_frequency is None + assert result.genetic_variation_type is None + def test_rgi(): metadata = {"analysis_software_version": "5.1.0", "reference_database_version": "2019-Jul-28", @@ -340,7 +350,8 @@ def test_rgi(): assert result.reference_protein_stop is None assert result.reference_gene_start is None assert result.reference_gene_stop is None - + assert result.variant_frequency is None + assert result.genetic_variation_type is None def test_srax(): metadata = {"analysis_software_version": "5.1.0", "reference_database_version": "2019-Jul-28", @@ -382,7 +393,8 @@ def test_srax(): assert result.reference_protein_stop is None assert result.reference_gene_start is None assert result.reference_gene_stop is None - + assert result.variant_frequency is None + assert result.genetic_variation_type is None def test_groot(): metadata = {"analysis_software_version": "0.0.1", "reference_database_version": "2019-Jul-28", @@ -424,7 +436,8 @@ def test_groot(): assert result.drug_class is None assert result.sequence_identity is None assert result.coverage_percentage is None - + assert result.variant_frequency is None + assert result.genetic_variation_type is None def test_deeparg(): metadata = {"analysis_software_version": "0.0.1", "reference_database_version": "2019-Jul-28", @@ -466,7 +479,8 @@ def test_deeparg(): assert result.reference_protein_stop is None assert result.reference_gene_start is None assert result.reference_gene_stop is None - + assert result.variant_frequency is None + assert result.genetic_variation_type is None def test_srst2(): metadata = {"analysis_software_version": "0.0.1", "reference_database_version": "2019-Jul-28", @@ -508,7 +522,8 @@ def test_srst2(): assert result.reference_protein_stop is None assert result.reference_gene_start is None assert result.reference_gene_stop is None - + assert result.variant_frequency is None + assert result.genetic_variation_type is None def test_csstar(): metadata = {"analysis_software_version": "0.0.1", "reference_database_version": "2019-Jul-28", @@ -550,8 +565,8 @@ def test_csstar(): assert result.reference_protein_stop is None assert result.reference_gene_start is None assert result.reference_gene_stop is None - - + assert result.variant_frequency is None + assert result.genetic_variation_type is None def test_tbprofiler(): metadata = {"input_file_name": "Dummy"} @@ -571,11 +586,10 @@ def test_tbprofiler(): # optional fields - present in dummy dataset assert result.drug_class == 'rifampicin' - - # mandatory but missing - assert result.sequence_identity is None + assert result.variant_frequency == 1 # missing data in report + assert result.sequence_identity is None assert result.reference_gene_length is None assert result.input_gene_length is None assert result.input_sequence_id is None From 6950fce24c21bffffac01bfb92b4b4ce551f76bd Mon Sep 17 00:00:00 2001 From: jodyphelan Date: Mon, 11 Oct 2021 19:55:14 +0100 Subject: [PATCH 07/11] fix tests --- hAMRonization/TBProfilerIO.py | 6 +++--- hAMRonization/hAMRonizedResult.py | 6 +++--- test/run_test.sh | 2 +- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/hAMRonization/TBProfilerIO.py b/hAMRonization/TBProfilerIO.py index 2e00313..51442d8 100644 --- a/hAMRonization/TBProfilerIO.py +++ b/hAMRonization/TBProfilerIO.py @@ -14,8 +14,8 @@ def __init__(self, source, metadata): self.field_mapping = { 'filename': 'input_file_name', - 'gene': 'gene_symbol', - 'gene': 'gene_name', + 'gene_symbol': 'gene_symbol', + 'gene_name': 'gene_name', 'drug': 'drug_class', 'type': 'genetic_variation_type', 'frequency': 'variant_frequency', @@ -38,7 +38,7 @@ def parse(self, handle): for drug in variant["drugs"]: result = { 'filename': handle.name, - 'gene': variant['gene'], + 'gene_symbol': variant['gene'], 'gene_name': variant['gene'], 'drug': drug['drug'], 'type': 'protein_variant' if variant['change'][0]=="p" else "nucleotide_variant", diff --git a/hAMRonization/hAMRonizedResult.py b/hAMRonization/hAMRonizedResult.py index ba3cfd1..7d2005c 100644 --- a/hAMRonization/hAMRonizedResult.py +++ b/hAMRonization/hAMRonizedResult.py @@ -21,9 +21,9 @@ class hAMRonizedResult(): analysis_software_version: str # variant specific optional fields - variant_frequency: float - genetic_variation_type: str - + variant_frequency: float = None + genetic_variation_type: str = None + # optional fields sequence_identity: float = None input_sequence_id: str = None diff --git a/test/run_test.sh b/test/run_test.sh index ab902a0..4758850 100644 --- a/test/run_test.sh +++ b/test/run_test.sh @@ -2,7 +2,7 @@ set -e -hamronize tb-profiler data/raw_outputs/tbprofiler/tbprofiler.json --format json --output hamronized_tbprofiler.json +hamronize tbprofiler data/raw_outputs/tbprofiler/tbprofiler.json --format json --output hamronized_tbprofiler.json hamronize abricate data/raw_outputs/abricate/report.tsv --reference_database_version db_v_1 --analysis_software_version tool_v_1 --format json --output hamronized_abricate.json hamronize abricate data/raw_outputs/abricate/report.tsv --reference_database_version db_v_1 --analysis_software_version tool_v_1 --format tsv --output hamronized_abricate.tsv From 039096397da8e123994e3736578f8a15484dfe27 Mon Sep 17 00:00:00 2001 From: jodyphelan Date: Mon, 11 Oct 2021 19:58:02 +0100 Subject: [PATCH 08/11] fix tests --- test/test_sanity.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/test/test_sanity.py b/test/test_sanity.py index bad9048..4bc5b0d 100644 --- a/test/test_sanity.py +++ b/test/test_sanity.py @@ -569,12 +569,11 @@ def test_csstar(): assert result.genetic_variation_type is None def test_tbprofiler(): - metadata = {"input_file_name": "Dummy"} parsed_report = hAMRonization.parse("dummy/tbprofiler/tbprofiler.json", metadata, "tbprofiler") for result in parsed_report: # assert mandatory fields - assert result.input_file_name == 'Dummy' + assert result.input_file_name == 'dummy/tbprofiler/tbprofiler.json' assert result.gene_symbol == 'rpoB' assert result.gene_name == 'rpoB' assert result.reference_database_id == 'tbdb' From 48bd902775c88b60024e92afe84914e6c4f96cf4 Mon Sep 17 00:00:00 2001 From: jodyphelan Date: Mon, 11 Oct 2021 19:59:34 +0100 Subject: [PATCH 09/11] add metadata back in --- test/test_sanity.py | 1 + 1 file changed, 1 insertion(+) diff --git a/test/test_sanity.py b/test/test_sanity.py index 4bc5b0d..a27af88 100644 --- a/test/test_sanity.py +++ b/test/test_sanity.py @@ -569,6 +569,7 @@ def test_csstar(): assert result.genetic_variation_type is None def test_tbprofiler(): + metadata = {} parsed_report = hAMRonization.parse("dummy/tbprofiler/tbprofiler.json", metadata, "tbprofiler") for result in parsed_report: From 776c8991b9c53bcd8df00ddf99223facb8636d59 Mon Sep 17 00:00:00 2001 From: jodyphelan Date: Mon, 11 Oct 2021 20:01:28 +0100 Subject: [PATCH 10/11] fix typo --- test/test_sanity.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/test_sanity.py b/test/test_sanity.py index a27af88..6778299 100644 --- a/test/test_sanity.py +++ b/test/test_sanity.py @@ -574,7 +574,7 @@ def test_tbprofiler(): for result in parsed_report: # assert mandatory fields - assert result.input_file_name == 'dummy/tbprofiler/tbprofiler.json' + assert result.input_file_name == 'tbprofiler.json' assert result.gene_symbol == 'rpoB' assert result.gene_name == 'rpoB' assert result.reference_database_id == 'tbdb' From 58512a2debf55918e1e75c1f68b6e8e2f2f4a079 Mon Sep 17 00:00:00 2001 From: jodyphelan Date: Mon, 11 Oct 2021 20:03:15 +0100 Subject: [PATCH 11/11] fix typo --- test/test_sanity.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/test_sanity.py b/test/test_sanity.py index 6778299..8ed121a 100644 --- a/test/test_sanity.py +++ b/test/test_sanity.py @@ -579,7 +579,7 @@ def test_tbprofiler(): assert result.gene_name == 'rpoB' assert result.reference_database_id == 'tbdb' assert result.reference_database_version == 'a800e0a' - assert result.analysis_software_name == 'tb-profiler' + assert result.analysis_software_name == 'TBProfiler' assert result.analysis_software_version == '3.0.8' assert result.genetic_variation_type == 'protein_variant' assert result.reference_accession == 'CCP43410'