From 683a36100626f8bedbe9aa360adc0d16486e3955 Mon Sep 17 00:00:00 2001 From: Mark Woon Date: Tue, 13 Aug 2024 16:29:37 -0700 Subject: [PATCH] fix(subsetter): fix bugs in subsetter --- .../phenotype/model/GenePhenotype.java | 36 ++++++-- .../phenotype/model/HaplotypeRecord.java | 8 ++ .../pharmcat/reporter/format/HtmlFormat.java | 5 +- .../pharmcat/subsetter/Subsetter.java | 89 ++++++++++--------- .../pharmcat/subsetter/SummaryWriter.java | 8 +- 5 files changed, 87 insertions(+), 59 deletions(-) diff --git a/src/main/java/org/pharmgkb/pharmcat/phenotype/model/GenePhenotype.java b/src/main/java/org/pharmgkb/pharmcat/phenotype/model/GenePhenotype.java index f9bee373..a86dbbcb 100644 --- a/src/main/java/org/pharmgkb/pharmcat/phenotype/model/GenePhenotype.java +++ b/src/main/java/org/pharmgkb/pharmcat/phenotype/model/GenePhenotype.java @@ -30,7 +30,7 @@ public class GenePhenotype { private Map m_haplotypes; @Expose @SerializedName("activityValues") - private Map m_activityValues = new HashMap<>(); + private Map m_activityValues = new HashMap<>(); @Expose @SerializedName("diplotypes") private SortedSet m_diplotypes; @@ -179,9 +179,27 @@ public String toString() { public void addHaplotypeRecord(String name, @Nullable String activityValue, @Nullable String functionValue, @Nullable String lookupKey) { + if (lookupKey == null) { + lookupKey = isActivityGene() ? activityValue : functionValue; + } + if (isActivityGene()) { + if (activityValue == null) { + throw new IllegalStateException("Cannot add activity gene haplotype without activity value"); + } + } else { + if (functionValue == null) { + throw new IllegalStateException("Cannot add function gene haplotype without function value"); + } + } + if (lookupKey == null) { + throw new IllegalStateException("Cannot add haplotype without lookupKey"); + } HaplotypeRecord hr = new HaplotypeRecord(name, activityValue, functionValue, lookupKey); m_namedAlleles.add(hr); m_haplotypes.put(name, hr.getLookupKey()); + if (hr.getActivityValue() != null) { + m_activityValues.put(name, hr.getActivityValue()); + } } public boolean update(String allele, @Nullable String activityScore, @Nullable String function, DataSource src) { @@ -191,7 +209,9 @@ public boolean update(String allele, @Nullable String activityScore, @Nullable S .orElse(null); if (hr == null) { // not all haplotypes will have a phenotype associated with it - return false; + System.out.println("Adding phenotype for " + m_gene + " " + allele); + addHaplotypeRecord(allele, activityScore, function, null); + return true; } boolean gotChange = false; boolean modified = false; @@ -200,11 +220,12 @@ public boolean update(String allele, @Nullable String activityScore, @Nullable S System.out.println("New " + src + " activity score for " + m_gene + " " + allele); modified = true; } else if (!hr.getActivityValue().equals(activityScore)){ - System.out.println("Overwriting " + src + " activity score for " + m_gene + " " + allele + - " (" + hr.getActivityValue() + " to " + activityScore + ")"); - modified = true; + if (Double.parseDouble(hr.getActivityValue()) != Double.parseDouble(activityScore)) { + System.out.println("Overwriting " + src + " activity score for " + m_gene + " " + allele + + " (" + hr.getActivityValue() + " to " + activityScore + ")"); + modified = true; + } } - hr.setActivityValue(activityScore); } else if (hr.getActivityValue() != null){ System.out.println("Nulling out " + src + " activity score for " + m_gene + " " + allele); modified = true; @@ -221,12 +242,11 @@ public boolean update(String allele, @Nullable String activityScore, @Nullable S if (hr.getFunctionValue() == null) { System.out.println("New " + src + " function for " + m_gene + " " + allele); modified = true; - } else if (!hr.getFunctionValue().equals(function)){ + } else if (!hr.getFunctionValue().equals(function)) { System.out.println("Overwriting " + src + " function for " + m_gene + " " + allele + " (" + hr.getFunctionValue() + " to " + function + ")"); modified = true; } - hr.setFunctionValue(function); } else if (hr.getFunctionValue() != null){ System.out.println("Nulling out " + src + " function for " + m_gene + " " + allele); modified = true; diff --git a/src/main/java/org/pharmgkb/pharmcat/phenotype/model/HaplotypeRecord.java b/src/main/java/org/pharmgkb/pharmcat/phenotype/model/HaplotypeRecord.java index dff56960..6b93d031 100644 --- a/src/main/java/org/pharmgkb/pharmcat/phenotype/model/HaplotypeRecord.java +++ b/src/main/java/org/pharmgkb/pharmcat/phenotype/model/HaplotypeRecord.java @@ -42,16 +42,20 @@ public String getActivityValue() { return activityValue; } + // needed to support subsetting public void setActivityValue(String activityValue) { this.activityValue = activityValue; + this.updateLookupKey(); } public String getFunctionValue() { return functionValue; } + // needed to support subsetting public void setFunctionValue(String functionValue) { this.functionValue = functionValue; + this.updateLookupKey(); } public String getLookupKey() { @@ -62,6 +66,10 @@ public void setLookupKey(String lookupKey) { this.lookupKey = lookupKey; } + private void updateLookupKey() { + this.lookupKey = this.activityValue == null ? this.functionValue : this.activityValue; + } + public String toFormattedFunction() { if (isUnspecified(this.activityValue)) { return this.functionValue; diff --git a/src/main/java/org/pharmgkb/pharmcat/reporter/format/HtmlFormat.java b/src/main/java/org/pharmgkb/pharmcat/reporter/format/HtmlFormat.java index feaa3ccf..eeb91d57 100644 --- a/src/main/java/org/pharmgkb/pharmcat/reporter/format/HtmlFormat.java +++ b/src/main/java/org/pharmgkb/pharmcat/reporter/format/HtmlFormat.java @@ -151,7 +151,10 @@ private Map compile(ReportContext reportContext) { } if (geneReport.isNoData()) { - noDataGenes.add(symbol); + // checking if allele definition exists for this gene because of subsetting + if (getEnv().getDefinitionReader().getGenes().contains(symbol)) { + noDataGenes.add(symbol); + } continue; } diff --git a/src/main/java/org/pharmgkb/pharmcat/subsetter/Subsetter.java b/src/main/java/org/pharmgkb/pharmcat/subsetter/Subsetter.java index 02054455..bb3a747b 100644 --- a/src/main/java/org/pharmgkb/pharmcat/subsetter/Subsetter.java +++ b/src/main/java/org/pharmgkb/pharmcat/subsetter/Subsetter.java @@ -2,7 +2,6 @@ import java.io.BufferedReader; import java.io.IOException; -import java.io.Writer; import java.lang.invoke.MethodHandles; import java.nio.charset.StandardCharsets; import java.nio.file.Files; @@ -51,7 +50,6 @@ public class Subsetter { // inputs private final Multimap m_allowList = HashMultimap.create(); - private final SortedMap> m_reportAsReference = new TreeMap<>(); private final Multimap m_functionOverrides = TreeMultimap.create(); private final Map> m_extraDefinitions = new HashMap<>(); @@ -234,7 +232,7 @@ private void writeSummary(Path file) throws IOException { if (!Files.exists(file.getParent())) { Files.createDirectories(file.getParent()); } - new SummaryWriter(m_phenotypeMap, m_geneData, m_reportAsReference).write(file); + new SummaryWriter(m_phenotypeMap, m_geneData).write(file); } @@ -252,10 +250,12 @@ private void exportDefinitionFiles(Path dir, Path defsDir) throws IOException { if (!Files.exists(defsDir)) { Files.createDirectories(defsDir); } else { - deleteObsoleteFiles(defsDir, "_translation.json", m_allowList.keySet(), "definitions"); + deleteObsoleteFiles(defsDir, ".json", m_allowList.keySet(), "definition"); + deleteObsoleteFiles(defsDir, ".vcf", m_allowList.keySet(), "definition"); + deleteObsoleteFiles(defsDir, ".bgz", m_allowList.keySet(), "definition"); + deleteObsoleteFiles(defsDir, ".csi", m_allowList.keySet(), "definition"); } - DataSerializer dataSerializer = new DataSerializer(); Set exemptions = new HashSet<>(); try (VcfHelper vcfHelper = new VcfHelper()) { for (GeneData gd : m_geneData.values()) { @@ -303,40 +303,41 @@ private void exportDefinitionFiles(Path dir, Path defsDir) throws IOException { // export Path jsonFile = defsDir.resolve(gd.gene + "_translation.json"); - dataSerializer.serializeToJson(gd.definitionFile, jsonFile); + DataSerializer.serializeToJson(gd.definitionFile, jsonFile); //System.out.println("\tWrote " + jsonFile); DefinitionExemption exemption = m_definitionReader.getExemption(gd.gene); if (exemption != null) { + if (exemption.getGene().equals("CYP2C9")) { + if (!m_geneData.containsKey("CYP4F2") || !m_geneData.containsKey("VKORC1")) { + // remove extra position only used for warfarin + exemption.getExtraPositions().stream() + .filter(p -> "rs12777823".equals(p.getRsid())) + .findAny() + .ifPresent(p -> exemption.getExtraPositions().remove(p)); + } + } exemptions.add(exemption); } } } // write definitions - Path exemptionsFile = defsDir.resolve(DataManager.EXEMPTIONS_JSON_FILE_NAME);; - dataSerializer.serializeToJson(exemptions, exemptionsFile); + Path exemptionsFile = defsDir.resolve(DataManager.EXEMPTIONS_JSON_FILE_NAME); + DataSerializer.serializeToJson(exemptions, exemptionsFile); // generate positions.vcf DataManager.exportVcfData(defsDir); - Files.copy(defsDir.resolve(DataManager.POSITIONS_VCF), + Files.move(defsDir.resolve(DataManager.POSITIONS_VCF), dir.resolve(DataManager.POSITIONS_VCF), StandardCopyOption.REPLACE_EXISTING); - Files.copy(defsDir.resolve(DataManager.POSITIONS_VCF + ".bgz"), + Files.move(defsDir.resolve(DataManager.POSITIONS_VCF + ".bgz"), dir.resolve(DataManager.POSITIONS_VCF + ".bgz"), StandardCopyOption.REPLACE_EXISTING); - Files.copy(defsDir.resolve(DataManager.POSITIONS_VCF + ".bgz.csi"), + Files.move(defsDir.resolve(DataManager.POSITIONS_VCF + ".bgz.csi"), dir.resolve(DataManager.POSITIONS_VCF + ".bgz.csi"), StandardCopyOption.REPLACE_EXISTING); - Files.copy(defsDir.resolve(DataManager.UNIALLELIC_POSITIONS_VCF + ".bgz"), + Files.move(defsDir.resolve(DataManager.UNIALLELIC_POSITIONS_VCF + ".bgz"), dir.resolve(DataManager.UNIALLELIC_POSITIONS_VCF + ".bgz"), StandardCopyOption.REPLACE_EXISTING); - Files.copy(defsDir.resolve(DataManager.UNIALLELIC_POSITIONS_VCF + ".bgz.csi"), + Files.move(defsDir.resolve(DataManager.UNIALLELIC_POSITIONS_VCF + ".bgz.csi"), dir.resolve(DataManager.UNIALLELIC_POSITIONS_VCF + ".bgz.csi"), StandardCopyOption.REPLACE_EXISTING); - - if (!m_reportAsReference.isEmpty()) { - Path file = defsDir.resolve("reportAsReference.json"); - System.out.println("Saving report-as-reference data in " + file); - try (Writer writer = Files.newBufferedWriter(file)) { - DataSerializer.GSON.toJson(m_reportAsReference, writer); - } - } } private void exportPhenotypes(Path dir) throws IOException { @@ -365,8 +366,8 @@ private void exportPhenotypes(Path dir) throws IOException { deleteObsoleteFiles(cpicDir, ".json", genes, "CPIC phenotypes"); } - DataSerializer dataSerializer = new DataSerializer(); - SortedSet modified = new TreeSet<>(writePhenotypes(genes, m_phenotypeMap.getCpicGenes(), cpicDir, dataSerializer, DataSource.CPIC)); + SortedSet modified = new TreeSet<>(writePhenotypes(genes, m_phenotypeMap.getCpicGenes(), cpicDir, + DataSource.CPIC)); /* Path dpwgDir = dir.resolve("dpwg"); @@ -385,10 +386,11 @@ private void exportPhenotypes(Path dir) throws IOException { } private Set writePhenotypes(Collection genes, Collection phenotypes, Path dir, - DataSerializer dataSerializer, DataSource src) throws IOException { + DataSource src) throws IOException { Set changed = new HashSet<>(); for (GenePhenotype gp : phenotypes) { if (genes.contains(gp.getGene())) { + boolean updatedFunctions = false; if (m_functionOverrides.containsKey(gp.getGene())) { GeneData geneData = m_geneData.get(gp.getGene()); // geneData can be null for outside-call only genes (e.g. CYP2D6) @@ -403,18 +405,25 @@ private Set writePhenotypes(Collection genes, Collection 2) { String mod = StringUtils.stripToNull(data[2]); if (mod != null) { - if (mod.equalsIgnoreCase("report as *1")) { - m_reportAsReference.computeIfAbsent(gene, g -> new TreeSet<>()) - .add(allele); - } else { - System.out.println("Don't know what to do with '" + mod + "' for " + gene + " " + allele); - } + System.out.println("Don't know what to do with '" + mod + "' for " + gene + " " + allele); } } @@ -711,12 +717,7 @@ private void parseAlleleAllowListXls(Path xlsxFile) throws IOException { if (modCell != null) { String mod = StringUtils.stripToNull(modCell.getStringCellValue()); if (mod != null) { - if (mod.equalsIgnoreCase("report as *1")) { - m_reportAsReference.computeIfAbsent(gene, g -> new TreeSet<>()) - .add(allele); - } else { - System.out.println("Don't know what to do with '" + mod + "' for " + gene + " " + allele); - } + System.out.println("Don't know what to do with '" + mod + "' for " + gene + " " + allele); } } } @@ -753,9 +754,11 @@ private void parseFunctionOverrides(Path xlsxFile) throws IOException { if (activityCell.getCellType() == CellType.NUMERIC) { double val = activityCell.getNumericCellValue(); if (val == 1) { - activity = "1"; + activity = "1.0"; } else if (val == 0) { - activity = "0"; + activity = "0.0"; + } else if (val == 2) { + activity = "2.0"; } else { activity = Double.toString(val); } diff --git a/src/main/java/org/pharmgkb/pharmcat/subsetter/SummaryWriter.java b/src/main/java/org/pharmgkb/pharmcat/subsetter/SummaryWriter.java index c24eac83..85c536bb 100644 --- a/src/main/java/org/pharmgkb/pharmcat/subsetter/SummaryWriter.java +++ b/src/main/java/org/pharmgkb/pharmcat/subsetter/SummaryWriter.java @@ -40,16 +40,13 @@ private enum HeaderType { DESC, RSID, CHR_POS, SUBSECTION } private final PhenotypeMap m_phenotypeMap; private final SortedMap m_geneData; - private final SortedMap> m_reportAsReference; private final List m_cellStyles = new ArrayList<>(); - public SummaryWriter(PhenotypeMap phenotypeMap, SortedMap geneData, - SortedMap> reportAsReference) { + public SummaryWriter(PhenotypeMap phenotypeMap, SortedMap geneData) { m_phenotypeMap = phenotypeMap; m_geneData = geneData; - m_reportAsReference = reportAsReference; } @@ -235,9 +232,6 @@ private int writePositionHeader(Row row, int colNum, SortedSet var private void writeHaplotype(Row row, GeneData gd, NamedAllele hap) { CellStyle cellStyle = null; - if (m_reportAsReference.containsKey(gd.gene) && m_reportAsReference.get(gd.gene).contains(hap.getName())) { - cellStyle = m_cellStyles.get(CELL_STYLE_REPORT_AS_REF); - } writeCell(row, 0, hap.getName(), cellStyle); writeFunction(row, gd.gene, hap, DataSource.CPIC); writeFunction(row, gd.gene, hap, DataSource.DPWG);