Skip to content

Commit

Permalink
fix(datamanager): handle all allele/position removal during data inge…
Browse files Browse the repository at this point in the history
…stion phase
  • Loading branch information
markwoon committed Apr 28, 2024
1 parent 5ecaa74 commit 85a770f
Show file tree
Hide file tree
Showing 9 changed files with 144 additions and 139 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -84,7 +84,7 @@ public DefinitionReader(List<Path> definitionFiles, @Nullable Path exemptionsFil
* This should be called <em>after</em> all allele definitions have been read.
*/
public String getGenomeBuild() {
Preconditions.checkState(m_definitionFiles.size() > 0);
Preconditions.checkState(!m_definitionFiles.isEmpty());

if (m_genomeBuild == null) {
for (DefinitionFile definitionFile : m_definitionFiles.values()) {
Expand Down Expand Up @@ -143,6 +143,10 @@ public SortedSet<NamedAllele> getHaplotypes(String gene) {
return m_definitionFiles.get(gene).getNamedAlleles();
}

/**
 * Gets the reference named allele for the specified gene.
 * <p>
 * No check is made that a definition file is registered for {@code gene}.
 *
 * @param gene key used to look up the gene's {@link DefinitionFile}
 * @return the result of {@link DefinitionFile#getReferenceNamedAllele()} for that gene's definition file
 */
public NamedAllele getReferenceHaplotype(String gene) {
  DefinitionFile geneDefinition = m_definitionFiles.get(gene);
  return geneDefinition.getReferenceNamedAllele();
}

/**
 * Looks up the {@link DefinitionExemption} for a gene.
 * The gene is lower-cased before the lookup.
 *
 * @param gene the gene to look up
 * @return whatever is stored under the lower-cased gene, which may be {@code null}
 */
public @Nullable DefinitionExemption getExemption(String gene) {
  String lookupKey = gene.toLowerCase();
  return m_exemptions.get(lookupKey);
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,9 @@ public DefinitionExemption(String gene, @Nullable SortedSet<VariantLocus> ignore
m_ignoredAllelesLc = m_ignoredAlleles;
} else {
m_ignoredAlleles = ignoredAlleles;
m_ignoredAllelesLc = ignoredAlleles.stream().map(String::toLowerCase).collect(Collectors.toCollection(TreeSet::new));
m_ignoredAllelesLc = ignoredAlleles.stream()
.map(String::toLowerCase)
.collect(Collectors.toCollection(TreeSet::new));
}
m_allHits = allHits;
}
Expand All @@ -61,7 +63,9 @@ public String getGene() {


/**
* Gets the positions from definition that to ignore.
* Gets the positions from original definition that should be ignored.
* These get removed by the {@link org.pharmgkb.pharmcat.util.DataManager} when definitions are first pulled back from
* PharmGKB.
*/
public SortedSet<VariantLocus> getIgnoredPositions() {
return m_ignoredPositions;
Expand All @@ -72,7 +76,7 @@ public SortedSet<VariantLocus> getIgnoredPositions() {
* <p>
* <b>Currently only checks based on RSID!</b>
*/
public boolean shouldIgnorePosition(VariantLocus position) {
boolean shouldIgnorePosition(VariantLocus position) {
return m_ignoredPositions.stream()
.anyMatch(vl -> {
if (vl.getRsid() != null) {
Expand All @@ -92,7 +96,9 @@ public SortedSet<VariantLocus> getExtraPositions() {


/**
* Gets the named alleles to ignore.
* Gets the named alleles from the original definition that should be ignored.
* These get removed by the {@link org.pharmgkb.pharmcat.util.DataManager} when definitions are first pulled back
* from PharmGKB.
*/
public SortedSet<String> getIgnoredAlleles() {
return m_ignoredAlleles;
Expand All @@ -101,7 +107,7 @@ public SortedSet<String> getIgnoredAlleles() {
/**
* Checks if the given named allele should be ignored.
*/
public boolean shouldIgnoreAllele(String allele) {
boolean shouldIgnoreAllele(String allele) {
  // m_ignoredAllelesLc holds the lower-cased ignored allele names, so lower-case the query to match
  String lowerCasedName = allele.toLowerCase();
  return m_ignoredAllelesLc.contains(lowerCasedName);
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
import java.util.stream.Collectors;
import com.google.gson.annotations.Expose;
import com.google.gson.annotations.SerializedName;
import org.checkerframework.checker.nullness.qual.Nullable;
import org.pharmgkb.pharmcat.ParseException;
import org.pharmgkb.pharmcat.haplotype.Iupac;
import org.pharmgkb.pharmcat.reporter.model.DataSource;
Expand Down Expand Up @@ -57,6 +58,11 @@ public class DefinitionFile {
@SerializedName("namedAlleles")
private SortedSet<NamedAllele> m_namedAlleles;

//-- cache
private Map<String, NamedAllele> m_namedalleleMap;
private NamedAllele m_referenceNamedAllele;



/**
* The format version of the definition file.
Expand Down Expand Up @@ -145,6 +151,39 @@ public SortedSet<NamedAllele> getNamedAlleles() {
return m_namedAlleles;
}

/**
 * Looks up a named allele by its name, building the name lookup cache on first use.
 *
 * @param name the name to look up; compared against {@code NamedAllele.getName()} exactly as given
 * @return the named allele stored under that name, or {@code null} if there is none
 */
public @Nullable NamedAllele getNamedAllele(String name) {
  if (m_namedalleleMap != null) {
    // cache already built
    return m_namedalleleMap.get(name);
  }
  mapNamedAlleles();
  return m_namedalleleMap.get(name);
}

/**
 * Gets the cached reference named allele, running {@code mapNamedAlleles()} first if it has not been
 * found yet.
 *
 * @return the value of the reference named allele cache after that attempt
 */
public NamedAllele getReferenceNamedAllele() {
  boolean referenceNotCached = m_referenceNamedAllele == null;
  if (referenceNotCached) {
    mapNamedAlleles();
  }
  return m_referenceNamedAllele;
}

/**
 * Builds the name-to-allele lookup cache and locates the reference named allele.
 * <p>
 * Does nothing if the lookup cache has already been built.  Results are collected in local variables and
 * only published to the cache fields once validation has passed.  A failed attempt therefore leaves no
 * partially populated cache behind: the next call runs the validation again and fails the same way,
 * instead of being skipped and letting callers see a missing reference or an incomplete map.
 *
 * @throws IllegalStateException if there is more than one reference named allele, or none at all
 */
private void mapNamedAlleles() {
  if (m_namedalleleMap != null) {
    return;
  }

  Map<String, NamedAllele> namedAlleleMap = new HashMap<>();
  NamedAllele referenceNamedAllele = null;
  for (NamedAllele allele : getNamedAlleles()) {
    namedAlleleMap.put(allele.getName(), allele);
    if (allele.isReference()) {
      if (referenceNamedAllele != null) {
        throw new IllegalStateException("Multiple reference named alleles: " + allele.getName() + " and " +
            referenceNamedAllele.getName());
      }
      referenceNamedAllele = allele;
    }
  }
  if (referenceNamedAllele == null) {
    throw new IllegalStateException(m_geneSymbol + " has no reference named allele!");
  }

  // publish only after validation succeeded so both caches are always set together
  m_namedalleleMap = namedAlleleMap;
  m_referenceNamedAllele = referenceNamedAllele;
}


@Override
public String toString() {
Expand Down Expand Up @@ -191,44 +230,81 @@ public void removeIgnoredNamedAlleles(DefinitionExemption exemption) {


/**
* Remove ignored positions specified in {@link DefinitionExemption}.
* Remove ignored positions specified in {@link DefinitionExemption} and any unused positions.
* Should only be called during initial generation of this {@link DefinitionFile} by {@link DataManager}.
*/
public void removeIgnoredPositions(DefinitionExemption exemption) {
// find ignored positions
Set<Integer> ignoredPositions = new HashSet<>();

// cannot use helper methods on NamedAlleles because they're not initialized yet
// must loop through elements manually

// find unused positions due to ignored NamedAlleles
SortedSet<VariantLocus> unusedPositions = new TreeSet<>();
for (int x = 0; x < m_variants.length; x += 1) {
boolean inUse = false;
for (NamedAllele na : m_namedAlleles) {
if (na.getCpicAlleles()[x] != null) {
inUse = true;
break;
}
}
if (!inUse) {
System.out.println(" Found unused position: " + m_variants[x]);
unusedPositions.add(m_variants[x]);
}
}

// remove unused/ignored positions
int numIgnored = 0;
int numUnused = 0;
Set<Integer> skipPositions = new HashSet<>();
List<VariantLocus> newVariants = new ArrayList<>();
for (int x = 0; x < m_variants.length; x += 1) {
if (exemption.shouldIgnorePosition(m_variants[x])) {
System.out.println(" Removing position " + x + " (" + m_variants[x] + ")");
ignoredPositions.add(x);
System.out.println(" Removing ignored position " + x + " (" + m_variants[x] + ")");
skipPositions.add(x);
numIgnored += 1;
} else if (unusedPositions.contains(m_variants[x])) {
System.out.println(" Removing unused position " + x + " (" + m_variants[x] + ")");
skipPositions.add(x);
numUnused += 1;
} else {
newVariants.add(m_variants[x]);
}
}
if (exemption.getIgnoredPositions().size() != ignoredPositions.size()) {
if (exemption.getIgnoredPositions().size() != numIgnored) {
throw new IllegalStateException("Should have " + exemption.getIgnoredPositions().size() + " ignored positions, " +
"but only found " + ignoredPositions.size());
"but only found " + numIgnored);
}
if (unusedPositions.size() != numUnused) {
throw new IllegalStateException("Should have " + unusedPositions.size() + " unused positions, but only found " +
numUnused);
}
// update variants
m_variants = newVariants.toArray(new VariantLocus[0]);

SortedSet<NamedAllele> updatedNamedAlleles = new TreeSet<>();
for (NamedAllele namedAllele : m_namedAlleles) {
String[] cpicAlleles = new String[namedAllele.getCpicAlleles().length - ignoredPositions.size()];
if (m_variants.length != cpicAlleles.length) {
// sanity check
int totalAlleles = namedAllele.getCpicAlleles().length - skipPositions.size();
if (m_variants.length != namedAllele.getCpicAlleles().length - skipPositions.size()) {
throw new IllegalStateException("Number of variants (" + m_variants.length + ") and number of CPIC alleles (" +
cpicAlleles.length + ") don't match up for " + namedAllele.getName());
totalAlleles + ") don't match up for " + namedAllele.getName());
}
String[] cpicAlleles = new String[totalAlleles];
for (int x = 0, y = 0; x < namedAllele.getCpicAlleles().length; x += 1) {
if (ignoredPositions.contains(x)) {
if (skipPositions.contains(x)) {
continue;
}
cpicAlleles[y] = namedAllele.getCpicAlleles()[x];
y += 1;
}
// if there's nothing left that differs from reference allele then don't include this named allele in output
if (!Arrays.stream(cpicAlleles).allMatch(Objects::isNull)) {

// if there's nothing left that differs from reference allele, then don't include this named allele in output
if (Arrays.stream(cpicAlleles).allMatch(Objects::isNull)) {
System.out.println("WARNING: Removing " + namedAllele.getName() +
" because it has no alleles after removing unused/ignored positions");
} else {
updatedNamedAlleles.add(new NamedAllele(namedAllele.getId(), namedAllele.getName(), null, cpicAlleles,
namedAllele.isReference()));
}
Expand All @@ -239,6 +315,7 @@ public void removeIgnoredPositions(DefinitionExemption exemption) {

/**
* Translate variants from CPIC to VCF (i.e. {@code cpicAlleles} to {@code alleles}).
* Should only be called during initial generation of this {@link DefinitionFile} by {@link DataManager}.
*/
public void doVcfTranslation(VcfHelper vcfHelper) throws IOException {

Expand Down Expand Up @@ -332,7 +409,7 @@ private void translateVariantLocus(NamedAllele referenceNamedAllele, VariantLocu
altAlleles.add(allele);
}
}
if (repeats.size() > 0 && repeats.size() != vl.getCpicAlleles().size()) {
if (!repeats.isEmpty() && repeats.size() != vl.getCpicAlleles().size()) {
boolean haveSingle = false;
if (nonRepeats.size() == 1) {
String repeatedSequence = repeats.get(0);
Expand All @@ -347,7 +424,7 @@ private void translateVariantLocus(NamedAllele referenceNamedAllele, VariantLocu

List<String> hgvsNames = VariantLocus.HGVS_NAME_SPLITTER.splitToList(vl.getChromosomeHgvsName());

if (!isSnp && repeats.size() == 0 && altAlleles.size() != 1) {
if (!isSnp && repeats.isEmpty() && altAlleles.size() != 1) {
// in/dels - must have HGVS to represent each change
throw new IllegalStateException(errorLocation + ": has " + altAlleles.size() + " alt alleles; max is 1");
}
Expand Down Expand Up @@ -392,7 +469,7 @@ private void translateVariantLocus(NamedAllele referenceNamedAllele, VariantLocu
" vs. " + vcfPosition);
}

} else if (repeats.size() > 0) {
} else if (!repeats.isEmpty()) {
Map<String, VcfHelper.VcfData> firstPass = new HashMap<>();
for (String h : hgvsNames) {
String repeatAlt;
Expand Down Expand Up @@ -467,12 +544,12 @@ private void translateVariantLocus(NamedAllele referenceNamedAllele, VariantLocu
}
}

if (missingAlts.size() > 0) {
if (!missingAlts.isEmpty()) {
if (altAlleles.size() == 1) {
throw new IllegalStateException(errorLocation + ": Missing alts " + missingAlts);
} else {
if (!vcfMap.entrySet().stream().allMatch((e) -> e.getKey().equals(e.getValue()))) {
// CPIC alleles needs to be translated
// CPIC alleles need to be translated
throw new IllegalStateException(errorLocation + ": Don't know how to translate " + missingAlts);
} else {
// no translation, use as is
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
import com.google.gson.annotations.SerializedName;
import org.apache.commons.lang3.ObjectUtils;
import org.checkerframework.checker.nullness.qual.NonNull;
import org.checkerframework.checker.nullness.qual.Nullable;
import org.pharmgkb.pharmcat.haplotype.Iupac;
import org.pharmgkb.pharmcat.haplotype.MatchData;
import org.pharmgkb.pharmcat.util.HaplotypeNameComparator;
Expand Down Expand Up @@ -249,7 +250,7 @@ public String getAllele(int idx) {
return m_alleles[idx];
}

public String getAllele(VariantLocus variantLocus) {
public @Nullable String getAllele(VariantLocus variantLocus) {
  // lookup by locus is refused (IllegalStateException) until m_isInitialized is set
  Preconditions.checkState(m_isInitialized, "This NamedAllele has not been initialized()");
  // may be null: nothing is stored in m_alleleMap for this locus
  String alleleAtLocus = m_alleleMap.get(variantLocus);
  return alleleAtLocus;
}
Expand All @@ -263,7 +264,7 @@ public String getCpicAllele(int x) {
return m_cpicAlleles[x];
}

public String getCpicAllele(VariantLocus variantLocus) {
public @Nullable String getCpicAllele(VariantLocus variantLocus) {
  // lookup by locus is refused (IllegalStateException) until m_isInitialized is set
  Preconditions.checkState(m_isInitialized, "This NamedAllele has not been initialized()");
  // may be null: nothing is stored in m_cpicAlleleMap for this locus
  String cpicAlleleAtLocus = m_cpicAlleleMap.get(variantLocus);
  return cpicAlleleAtLocus;
}
Expand Down
25 changes: 16 additions & 9 deletions src/main/java/org/pharmgkb/pharmcat/haplotype/MatchData.java
Original file line number Diff line number Diff line change
@@ -1,7 +1,17 @@
package org.pharmgkb.pharmcat.haplotype;

import java.lang.invoke.MethodHandles;
import java.util.*;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.SortedMap;
import java.util.SortedSet;
import java.util.TreeMap;
import java.util.TreeSet;
import java.util.stream.Collectors;
import com.google.common.base.Preconditions;
import com.google.gson.annotations.Expose;
Expand Down Expand Up @@ -62,7 +72,7 @@ public class MatchData {
* @param alleleMap map of chr:positions to {@link SampleAllele}s from VCF
* @param allPositions all {@link VariantLocus} positions of interest for the gene
* @param extraPositions extra positions to track sample alleles for
* @param ignoredPositions ignored positions due to ignored named alleles
* @param ignoredPositions ignored positions to remove from matching (used for special cases like DPYD)
*/
public MatchData(String sampleId, String gene, SortedMap<String, SampleAllele> alleleMap, VariantLocus[] allPositions,
@Nullable SortedSet<VariantLocus> extraPositions, @Nullable SortedSet<VariantLocus> ignoredPositions) {
Expand All @@ -84,7 +94,7 @@ public MatchData(String sampleId, String gene, SortedMap<String, SampleAllele> a
if (m_ignoredPositions.contains(variant)) {
continue;
}
if (allele.getUndocumentedVariations().size() > 0) {
if (!allele.getUndocumentedVariations().isEmpty()) {
m_positionsWithUndocumentedVariations.add(variant);
if (allele.isTreatUndocumentedVariationsAsReference()) {
m_treatUndocumentedVariationsAsReference = true;
Expand Down Expand Up @@ -175,11 +185,8 @@ private boolean isIgnorableCombination(String gene, NamedAllele hap) {
void defaultMissingAllelesToReference() {

SortedSet<NamedAllele> updatedHaplotypes = new TreeSet<>();
Optional<NamedAllele> refHapOpt = m_haplotypes.stream().filter(NamedAllele::isReference).findAny();
if (refHapOpt.isEmpty()) {
throw new IllegalStateException(m_gene + " does not have a reference");
}
NamedAllele referenceHaplotype = refHapOpt.get();
NamedAllele referenceHaplotype = m_haplotypes.stream().filter(NamedAllele::isReference).findAny()
.orElseThrow(() -> new IllegalStateException(m_gene + " does not have a reference"));
int numAlleles = referenceHaplotype.getAlleles().length;
for (NamedAllele hap : m_haplotypes) {
if (referenceHaplotype == hap) {
Expand Down Expand Up @@ -319,7 +326,7 @@ public SortedSet<Variant> getExtraPositions() {
*/
public SortedSet<NamedAllele> getHaplotypes() {
if (m_haplotypes == null) {
if (m_sampleMap.size() == 0) {
if (m_sampleMap.isEmpty()) {
return Collections.emptySortedSet();
}
throw new IllegalStateException("Not initialized - call marshallHaplotypes()");
Expand Down
Loading

0 comments on commit 85a770f

Please sign in to comment.