fix(subsetter): fix bugs in subsetter
markwoon committed Aug 14, 2024
1 parent 0b4a4a9 commit 683a361
Showing 5 changed files with 87 additions and 59 deletions.
@@ -30,7 +30,7 @@ public class GenePhenotype {
private Map<String, String> m_haplotypes;
@Expose
@SerializedName("activityValues")
private Map<String,String> m_activityValues = new HashMap<>();
private Map<String, String> m_activityValues = new HashMap<>();
@Expose
@SerializedName("diplotypes")
private SortedSet<DiplotypeRecord> m_diplotypes;
@@ -179,9 +179,27 @@ public String toString() {

public void addHaplotypeRecord(String name, @Nullable String activityValue, @Nullable String functionValue,
@Nullable String lookupKey) {
if (lookupKey == null) {
lookupKey = isActivityGene() ? activityValue : functionValue;
}
if (isActivityGene()) {
if (activityValue == null) {
throw new IllegalStateException("Cannot add activity gene haplotype without activity value");
}
} else {
if (functionValue == null) {
throw new IllegalStateException("Cannot add function gene haplotype without function value");
}
}
if (lookupKey == null) {
throw new IllegalStateException("Cannot add haplotype without lookupKey");
}
HaplotypeRecord hr = new HaplotypeRecord(name, activityValue, functionValue, lookupKey);
m_namedAlleles.add(hr);
m_haplotypes.put(name, hr.getLookupKey());
if (hr.getActivityValue() != null) {
m_activityValues.put(name, hr.getActivityValue());
}
}

public boolean update(String allele, @Nullable String activityScore, @Nullable String function, DataSource src) {
@@ -191,7 +209,9 @@ public boolean update(String allele, @Nullable String activityScore, @Nullable S
.orElse(null);
if (hr == null) {
// not all haplotypes will have a phenotype associated with it
return false;
System.out.println("Adding phenotype for " + m_gene + " " + allele);
addHaplotypeRecord(allele, activityScore, function, null);
return true;
}
boolean gotChange = false;
boolean modified = false;
@@ -200,11 +220,12 @@ public boolean update(String allele, @Nullable String activityScore, @Nullable S
System.out.println("New " + src + " activity score for " + m_gene + " " + allele);
modified = true;
} else if (!hr.getActivityValue().equals(activityScore)){
System.out.println("Overwriting " + src + " activity score for " + m_gene + " " + allele +
" (" + hr.getActivityValue() + " to " + activityScore + ")");
modified = true;
if (Double.parseDouble(hr.getActivityValue()) != Double.parseDouble(activityScore)) {
System.out.println("Overwriting " + src + " activity score for " + m_gene + " " + allele +
" (" + hr.getActivityValue() + " to " + activityScore + ")");
modified = true;
}
}
hr.setActivityValue(activityScore);
} else if (hr.getActivityValue() != null){
System.out.println("Nulling out " + src + " activity score for " + m_gene + " " + allele);
modified = true;
@@ -221,12 +242,11 @@ public boolean update(String allele, @Nullable String activityScore, @Nullable S
if (hr.getFunctionValue() == null) {
System.out.println("New " + src + " function for " + m_gene + " " + allele);
modified = true;
} else if (!hr.getFunctionValue().equals(function)){
} else if (!hr.getFunctionValue().equals(function)) {
System.out.println("Overwriting " + src + " function for " + m_gene + " " + allele + " (" +
hr.getFunctionValue() + " to " + function + ")");
modified = true;
}
hr.setFunctionValue(function);
} else if (hr.getFunctionValue() != null){
System.out.println("Nulling out " + src + " function for " + m_gene + " " + allele);
modified = true;
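In GenePhenotype, addHaplotypeRecord now rejects haplotypes that are missing a required activity value, function value, or lookup key, update adds a record for alleles that have no phenotype entry yet, and an "Overwriting ... activity score" message is only logged when the stored and incoming scores differ as numbers, not merely as strings. A minimal standalone sketch of that numeric check follows; the class, the method name, and the null/parse guards are illustrative additions, not PharmCAT code.

import java.util.Objects;

public final class ActivityScoreCheck {

  // True only when two activity scores really differ: "1" vs. "1.0" are
  // different strings but the same score, so no overwrite should be reported.
  static boolean differs(String current, String incoming) {
    if (Objects.equals(current, incoming)) {
      return false;                 // identical strings, or both null
    }
    if (current == null || incoming == null) {
      return true;                  // one side is missing
    }
    try {
      return Double.parseDouble(current) != Double.parseDouble(incoming);
    } catch (NumberFormatException ex) {
      return true;                  // non-numeric scores fall back to string inequality
    }
  }

  public static void main(String[] args) {
    System.out.println(differs("1", "1.0"));   // false: no "Overwriting" message
    System.out.println(differs("1.0", "2.0")); // true: the overwrite is reported
  }
}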
@@ -42,16 +42,20 @@ public String getActivityValue() {
return activityValue;
}

// needed to support subsetting
public void setActivityValue(String activityValue) {
this.activityValue = activityValue;
this.updateLookupKey();
}

public String getFunctionValue() {
return functionValue;
}

// needed to support subsetting
public void setFunctionValue(String functionValue) {
this.functionValue = functionValue;
this.updateLookupKey();
}

public String getLookupKey() {
@@ -62,6 +66,10 @@ public void setLookupKey(String lookupKey) {
this.lookupKey = lookupKey;
}

private void updateLookupKey() {
this.lookupKey = this.activityValue == null ? this.functionValue : this.activityValue;
}

public String toFormattedFunction() {
if (isUnspecified(this.activityValue)) {
return this.functionValue;
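The setter changes above keep the haplotype's lookup key in sync with its activity and function values: the key follows the activity value when one is set and falls back to the function value otherwise (this appears to be HaplotypeRecord, the class constructed in the GenePhenotype change). A usage sketch, relying only on the constructor and accessors visible in this commit; the package in the import is an assumption and may differ.

// Usage sketch only; the package name below is an assumption.
import org.pharmgkb.pharmcat.phenotype.model.HaplotypeRecord;

public class LookupKeySyncDemo {
  public static void main(String[] args) {
    HaplotypeRecord hr = new HaplotypeRecord("*2", null, "No function", "No function");

    // setting an activity value now updates the lookup key as well
    hr.setActivityValue("0.0");
    System.out.println(hr.getLookupKey());   // expected: 0.0

    // clearing the activity value falls back to the function value
    hr.setActivityValue(null);
    System.out.println(hr.getLookupKey());   // expected: No function
  }
}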
@@ -151,7 +151,10 @@ private Map<String,Object> compile(ReportContext reportContext) {
}

if (geneReport.isNoData()) {
noDataGenes.add(symbol);
// checking if allele definition exists for this gene because of subsetting
if (getEnv().getDefinitionReader().getGenes().contains(symbol)) {
noDataGenes.add(symbol);
}
continue;
}

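The report change above stops listing a gene under "no data" when it is absent from the (possibly subset) allele definitions, so genes deliberately dropped by the subsetter are no longer flagged as missing. A small sketch of that guard with illustrative names; the real code reads the gene set from the environment's DefinitionReader.

import java.util.List;
import java.util.Set;
import java.util.SortedSet;
import java.util.TreeSet;

public class NoDataGeneGuard {

  // Only genes that still have allele definitions count as "no data";
  // genes removed by the subsetter are skipped silently.
  static SortedSet<String> noDataGenes(List<String> genesWithoutCalls, Set<String> definedGenes) {
    SortedSet<String> result = new TreeSet<>();
    for (String symbol : genesWithoutCalls) {
      if (definedGenes.contains(symbol)) {
        result.add(symbol);
      }
    }
    return result;
  }

  public static void main(String[] args) {
    // CYP2B6 is not in the subset definitions, so only CYP2C19 is reported
    System.out.println(noDataGenes(List.of("CYP2C19", "CYP2B6"), Set.of("CYP2C19", "CYP2C9")));
  }
}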
89 changes: 46 additions & 43 deletions src/main/java/org/pharmgkb/pharmcat/subsetter/Subsetter.java
@@ -2,7 +2,6 @@

import java.io.BufferedReader;
import java.io.IOException;
import java.io.Writer;
import java.lang.invoke.MethodHandles;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
@@ -51,7 +50,6 @@
public class Subsetter {
// inputs
private final Multimap<String, String> m_allowList = HashMultimap.create();
private final SortedMap<String, SortedSet<String>> m_reportAsReference = new TreeMap<>();
private final Multimap<String, FunctionData> m_functionOverrides = TreeMultimap.create();
private final Map<String, Map<String, NamedAllele>> m_extraDefinitions = new HashMap<>();

@@ -234,7 +232,7 @@ private void writeSummary(Path file) throws IOException {
if (!Files.exists(file.getParent())) {
Files.createDirectories(file.getParent());
}
new SummaryWriter(m_phenotypeMap, m_geneData, m_reportAsReference).write(file);
new SummaryWriter(m_phenotypeMap, m_geneData).write(file);
}


@@ -252,10 +250,12 @@ private void exportDefinitionFiles(Path dir, Path defsDir) throws IOException {
if (!Files.exists(defsDir)) {
Files.createDirectories(defsDir);
} else {
deleteObsoleteFiles(defsDir, "_translation.json", m_allowList.keySet(), "definitions");
deleteObsoleteFiles(defsDir, ".json", m_allowList.keySet(), "definition");
deleteObsoleteFiles(defsDir, ".vcf", m_allowList.keySet(), "definition");
deleteObsoleteFiles(defsDir, ".bgz", m_allowList.keySet(), "definition");
deleteObsoleteFiles(defsDir, ".csi", m_allowList.keySet(), "definition");
}

DataSerializer dataSerializer = new DataSerializer();
Set<DefinitionExemption> exemptions = new HashSet<>();
try (VcfHelper vcfHelper = new VcfHelper()) {
for (GeneData gd : m_geneData.values()) {
@@ -303,40 +303,41 @@ private void exportDefinitionFiles(Path dir, Path defsDir) throws IOException {

// export
Path jsonFile = defsDir.resolve(gd.gene + "_translation.json");
dataSerializer.serializeToJson(gd.definitionFile, jsonFile);
DataSerializer.serializeToJson(gd.definitionFile, jsonFile);
//System.out.println("\tWrote " + jsonFile);

DefinitionExemption exemption = m_definitionReader.getExemption(gd.gene);
if (exemption != null) {
if (exemption.getGene().equals("CYP2C9")) {
if (!m_geneData.containsKey("CYP4F2") || !m_geneData.containsKey("VKORC1")) {
// remove extra position only used for warfarin
exemption.getExtraPositions().stream()
.filter(p -> "rs12777823".equals(p.getRsid()))
.findAny()
.ifPresent(p -> exemption.getExtraPositions().remove(p));
}
}
exemptions.add(exemption);
}
}
}

// write definitions
Path exemptionsFile = defsDir.resolve(DataManager.EXEMPTIONS_JSON_FILE_NAME);;
dataSerializer.serializeToJson(exemptions, exemptionsFile);
Path exemptionsFile = defsDir.resolve(DataManager.EXEMPTIONS_JSON_FILE_NAME);
DataSerializer.serializeToJson(exemptions, exemptionsFile);
// generate positions.vcf
DataManager.exportVcfData(defsDir);
Files.copy(defsDir.resolve(DataManager.POSITIONS_VCF),
Files.move(defsDir.resolve(DataManager.POSITIONS_VCF),
dir.resolve(DataManager.POSITIONS_VCF), StandardCopyOption.REPLACE_EXISTING);
Files.copy(defsDir.resolve(DataManager.POSITIONS_VCF + ".bgz"),
Files.move(defsDir.resolve(DataManager.POSITIONS_VCF + ".bgz"),
dir.resolve(DataManager.POSITIONS_VCF + ".bgz"), StandardCopyOption.REPLACE_EXISTING);
Files.copy(defsDir.resolve(DataManager.POSITIONS_VCF + ".bgz.csi"),
Files.move(defsDir.resolve(DataManager.POSITIONS_VCF + ".bgz.csi"),
dir.resolve(DataManager.POSITIONS_VCF + ".bgz.csi"), StandardCopyOption.REPLACE_EXISTING);

Files.copy(defsDir.resolve(DataManager.UNIALLELIC_POSITIONS_VCF + ".bgz"),
Files.move(defsDir.resolve(DataManager.UNIALLELIC_POSITIONS_VCF + ".bgz"),
dir.resolve(DataManager.UNIALLELIC_POSITIONS_VCF + ".bgz"), StandardCopyOption.REPLACE_EXISTING);
Files.copy(defsDir.resolve(DataManager.UNIALLELIC_POSITIONS_VCF + ".bgz.csi"),
Files.move(defsDir.resolve(DataManager.UNIALLELIC_POSITIONS_VCF + ".bgz.csi"),
dir.resolve(DataManager.UNIALLELIC_POSITIONS_VCF + ".bgz.csi"), StandardCopyOption.REPLACE_EXISTING);

if (!m_reportAsReference.isEmpty()) {
Path file = defsDir.resolve("reportAsReference.json");
System.out.println("Saving report-as-reference data in " + file);
try (Writer writer = Files.newBufferedWriter(file)) {
DataSerializer.GSON.toJson(m_reportAsReference, writer);
}
}
}

private void exportPhenotypes(Path dir) throws IOException {
@@ -365,8 +366,8 @@ private void exportPhenotypes(Path dir) throws IOException {
deleteObsoleteFiles(cpicDir, ".json", genes, "CPIC phenotypes");
}

DataSerializer dataSerializer = new DataSerializer();
SortedSet<String> modified = new TreeSet<>(writePhenotypes(genes, m_phenotypeMap.getCpicGenes(), cpicDir, dataSerializer, DataSource.CPIC));
SortedSet<String> modified = new TreeSet<>(writePhenotypes(genes, m_phenotypeMap.getCpicGenes(), cpicDir,
DataSource.CPIC));

/*
Path dpwgDir = dir.resolve("dpwg");
@@ -385,10 +386,11 @@ private void exportPhenotypes(Path dir) throws IOException {
}

private Set<String> writePhenotypes(Collection<String> genes, Collection<GenePhenotype> phenotypes, Path dir,
DataSerializer dataSerializer, DataSource src) throws IOException {
DataSource src) throws IOException {
Set<String> changed = new HashSet<>();
for (GenePhenotype gp : phenotypes) {
if (genes.contains(gp.getGene())) {
boolean updatedFunctions = false;
if (m_functionOverrides.containsKey(gp.getGene())) {
GeneData geneData = m_geneData.get(gp.getGene());
// geneData can be null for outside-call only genes (e.g. CYP2D6)
@@ -403,18 +405,25 @@ private Set<String> writePhenotypes(Collection<String> genes, Collection<GenePhe
System.out.println("WARNING: extra definition for " + na.getName() + " but no function was specified");
continue;
}
System.out.println("New named allele: " + gp.getGene() + " " + na.getName());
updatedFunctions = true;
gp.addHaplotypeRecord(na.getName(), fd.activityScore, fd.function, null);
}
}

for (FunctionData fd : m_functionOverrides.get(gp.getGene())) {
gp.update(fd.allele, fd.activityScore, fd.function, src);
if (gp.update(fd.allele, fd.activityScore, fd.function, src)) {
updatedFunctions = true;
}
}
changed.add(gp.getGene());
}
// export
Path jsonFile = dir.resolve(gp.getGene() + ".json");
dataSerializer.serializeToJson(gp, jsonFile);
if (updatedFunctions) {
gp.generateDiplotypes();
}
DataSerializer.serializeToJson(gp, jsonFile);
//System.out.println("\tWrote " + jsonFile);
}
}
@@ -478,13 +487,13 @@ public static void main(String[] args) {
}

Path dataDir = cliHelper.getValidDirectory("i", true);
Path baseDefDir = dataDir.resolve("definitions");
Path baseDefDir = dataDir.resolve("definition");
if (cliHelper.hasOption("d") || cliHelper.hasOption("pos") || cliHelper.hasOption("a")) {
if (!Files.isDirectory(baseDefDir)) {
System.out.println("Cannot find 'definitions' subdirectory in " + dataDir);
}
}
Path basePhenoDir = dataDir.resolve("phenotypes");
Path basePhenoDir = dataDir.resolve("phenotype");
if (cliHelper.hasOption("pc")) {
if (!Files.isDirectory(basePhenoDir)) {
System.out.println("Cannot find 'phenotypes' subdirectory in " + dataDir);
@@ -524,7 +533,9 @@ public static void main(String[] args) {
subsetter.parseFunctionOverrides(file);
}

boolean updatedDefinitions = subsetter.updateDefinitions();
if (!subsetter.updateDefinitions()) {
System.out.println("No definitions updated.");
}

if (cliHelper.hasOption("o")) {
Path outDir = cliHelper.getValidDirectory("o", true);
@@ -656,12 +667,7 @@ private void parseAlleleAllowListTsv(Path tsvFile) throws IOException {
if (data.length > 2) {
String mod = StringUtils.stripToNull(data[2]);
if (mod != null) {
if (mod.equalsIgnoreCase("report as *1")) {
m_reportAsReference.computeIfAbsent(gene, g -> new TreeSet<>())
.add(allele);
} else {
System.out.println("Don't know what to do with '" + mod + "' for " + gene + " " + allele);
}
System.out.println("Don't know what to do with '" + mod + "' for " + gene + " " + allele);
}
}

@@ -711,12 +717,7 @@ private void parseAlleleAllowListXls(Path xlsxFile) throws IOException {
if (modCell != null) {
String mod = StringUtils.stripToNull(modCell.getStringCellValue());
if (mod != null) {
if (mod.equalsIgnoreCase("report as *1")) {
m_reportAsReference.computeIfAbsent(gene, g -> new TreeSet<>())
.add(allele);
} else {
System.out.println("Don't know what to do with '" + mod + "' for " + gene + " " + allele);
}
System.out.println("Don't know what to do with '" + mod + "' for " + gene + " " + allele);
}
}
}
@@ -753,9 +754,11 @@ private void parseFunctionOverrides(Path xlsxFile) throws IOException {
if (activityCell.getCellType() == CellType.NUMERIC) {
double val = activityCell.getNumericCellValue();
if (val == 1) {
activity = "1";
activity = "1.0";
} else if (val == 0) {
activity = "0";
activity = "0.0";
} else if (val == 2) {
activity = "2.0";
} else {
activity = Double.toString(val);
}
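The Subsetter changes above drop the report-as-reference handling, clean up obsolete .json/.vcf/.bgz/.csi files, move (rather than copy) the positions VCFs and their indexes into the output directory, call DataSerializer.serializeToJson statically, and normalize numeric Excel activity scores to one-decimal strings ("0.0", "1.0", "2.0") so they match the phenotype files. A sketch of one way to do that normalization for any whole number; the commit itself special-cases 0, 1, and 2, and the class below is illustrative only.

import java.util.Locale;

public class ActivityScoreFormat {

  // Whole-number scores from a numeric Excel cell are rendered with one
  // decimal place so "1" matches the "1.0" used in the phenotype JSON.
  static String normalize(double val) {
    if (!Double.isNaN(val) && !Double.isInfinite(val) && val == Math.floor(val)) {
      return String.format(Locale.ROOT, "%.1f", val);   // 1 -> "1.0"
    }
    return Double.toString(val);                         // 0.25 -> "0.25"
  }

  public static void main(String[] args) {
    System.out.println(normalize(1));      // 1.0
    System.out.println(normalize(0));      // 0.0
    System.out.println(normalize(2));      // 2.0
    System.out.println(normalize(0.25));   // 0.25
  }
}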
@@ -40,16 +40,13 @@ private enum HeaderType { DESC, RSID, CHR_POS, SUBSECTION }

private final PhenotypeMap m_phenotypeMap;
private final SortedMap<String, GeneData> m_geneData;
private final SortedMap<String, SortedSet<String>> m_reportAsReference;

private final List<CellStyle> m_cellStyles = new ArrayList<>();


public SummaryWriter(PhenotypeMap phenotypeMap, SortedMap<String, GeneData> geneData,
SortedMap<String, SortedSet<String>> reportAsReference) {
public SummaryWriter(PhenotypeMap phenotypeMap, SortedMap<String, GeneData> geneData) {
m_phenotypeMap = phenotypeMap;
m_geneData = geneData;
m_reportAsReference = reportAsReference;
}


@@ -235,9 +232,6 @@ private int writePositionHeader(Row row, int colNum, SortedSet<VariantLocus> var

private void writeHaplotype(Row row, GeneData gd, NamedAllele hap) {
CellStyle cellStyle = null;
if (m_reportAsReference.containsKey(gd.gene) && m_reportAsReference.get(gd.gene).contains(hap.getName())) {
cellStyle = m_cellStyles.get(CELL_STYLE_REPORT_AS_REF);
}
writeCell(row, 0, hap.getName(), cellStyle);
writeFunction(row, gd.gene, hap, DataSource.CPIC);
writeFunction(row, gd.gene, hap, DataSource.DPWG);
