Skip to content

Commit

Permalink
Address reviewer comments; greatly simplify clustering; add ctx/cpx s…
Browse files Browse the repository at this point in the history
…upport
  • Loading branch information
mwalker174 committed Jan 21, 2023
1 parent 976887b commit c6626ce
Show file tree
Hide file tree
Showing 34 changed files with 1,243 additions and 1,843 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,22 @@ public final class GATKSVVCFConstants {
public static final String CPX_INTERVALS = "CPX_INTERVALS";
public static final String CPX_TYPE = "CPX_TYPE";

public enum ComplexVariantSubtype {
delINV,
INVdel,
dupINV,
INVdup,
delINVdel,
dupINVdup,
delINVdup,
dupINVdel,
piDUP_FR,
piDUP_RF,
dDUP,
dDUP_iDEL,
INS_iDEL
}

// not defined in output vcf header but used in internal id that is currently output in the ID column
public static final String INTERVAL_VARIANT_ID_FIELD_SEPARATOR = "_";
public static final String DUP_TAN_CONTRACTION_INTERNAL_ID_START_STRING = "DEL-DUPLICATION-TANDEM-CONTRACTION";
Expand Down Expand Up @@ -106,7 +122,6 @@ public final class GATKSVVCFConstants {
public static final String CLUSTER_MEMBER_IDS_KEY = "MEMBERS";

// Concordance

public static final String GENOTYPE_CONCORDANCE_INFO = "GENOTYPE_CONCORDANCE";
public static final String NON_REF_GENOTYPE_CONCORDANCE_INFO = "NON_REF_GENOTYPE_CONCORDANCE";

Expand Down Expand Up @@ -149,6 +164,18 @@ public final class GATKSVVCFConstants {
public static final String NEAREST_TSS = "PREDICTED_NEAREST_TSS";
public static final String TSS_DUP = "PREDICTED_TSS_DUP";

// SVTYPE classes
public enum StructuralVariantAnnotationType {
DEL,
DUP,
INS,
INV,
CPX,
BND,
CTX,
CNV
}

///////////////////////////////////////////////////////////////////////////////////////////////////////////////////

}
Original file line number Diff line number Diff line change
Expand Up @@ -4,66 +4,65 @@
import htsjdk.variant.variantcontext.Genotype;

import java.util.Arrays;
import java.util.Collection;
import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;

/**
* Simple allele counter for SVs. Supports multi-allelic variants, except for multi-allelic CNVs that lack genotype alleles.
*/
public class SVAlleleCounter {

private final List<Allele> alleles;
private final int[] alleleCounts;
private final int alleleNumber;
private final int[] counts;
private final double[] frequencies;
private final int number;

protected SVAlleleCounter(final List<Allele> alleles,
final int[] alleleCounts,
final int alleleNumber) {
public SVAlleleCounter(final List<Allele> alleles,
final List<Genotype> genotypes) {
this.alleles = alleles;
this.alleleCounts = alleleCounts;
this.alleleNumber = alleleNumber;
final Map<Allele, Long> alleleCountsMap = computeCounts(genotypes);
number = alleleCountsMap.values().stream().mapToInt(Long::intValue).sum();
counts = new int[alleles.size()];
for (int i = 0; i < alleles.size(); i++) {
counts[i] = alleleCountsMap.getOrDefault(alleles.get(i), 0L).intValue();
}
this.frequencies = computeFrequencies(counts, number);
}

public List<Allele> getAlleles() {
return alleles;
}

public int[] getAlleleCounts() {
return alleleCounts;
public int[] getCounts() {
return counts;
}

public int getAlleleNumber() {
return alleleNumber;
public int getNumber() {
return number;
}

public double[] getFrequencies() { return frequencies; }

/**
* Factory method for counting alleles. All calculations except AF are performed here.
* @param alleles alternate alleles to count
* @param genotypes genotypes with assigned alleles
* @return result
* Counts unique alleles in the given set of genotypes.
*/
public static SVAlleleCounter create(final List<Allele> alleles, final List<Genotype> genotypes) {
final Map<Allele, Long> alleleCountsMap = SVCallRecordUtils.getAlleleCounts(genotypes);
final int alleleNumber = alleleCountsMap.values().stream().mapToInt(Long::intValue).sum();

final int[] alleleCounts = new int[alleles.size()];
for (int i = 0; i < alleles.size(); i++) {
alleleCounts[i] = alleleCountsMap.getOrDefault(alleles.get(i), 0L).intValue();
}
return new SVAlleleCounter(alleles, alleleCounts, alleleNumber);
private static Map<Allele, Long> computeCounts(final Collection<Genotype> genotypes) {
return genotypes.stream().map(Genotype::getAlleles).flatMap(Collection::stream)
.collect(Collectors.groupingBy(a -> a, Collectors.counting()));
}

/**
* Get allele frequencies (AF). Avoid calling this repeatedly on the same object, as computations are performed
* on the fly.
* Compute allele frequencies (AF) based on counts.
*/
public double[] computeFrequencies() {
final double[] freq = new double[alleleCounts.length];
if (alleleNumber == 0) {
private static double[] computeFrequencies(final int[] counts, final int number) {
final double[] freq = new double[counts.length];
if (number == 0) {
Arrays.fill(freq, Double.NaN);
} else {
for (int i = 0; i < alleleCounts.length; i++) {
freq[i] = alleleCounts[i] / (double) alleleNumber;
for (int i = 0; i < counts.length; i++) {
freq[i] = counts[i] / (double) number;
}
}
return freq;
Expand Down
105 changes: 44 additions & 61 deletions src/main/java/org/broadinstitute/hellbender/tools/sv/SVCallRecord.java
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,6 @@
import htsjdk.variant.variantcontext.Allele;
import htsjdk.variant.variantcontext.Genotype;
import htsjdk.variant.variantcontext.GenotypesContext;
import htsjdk.variant.variantcontext.StructuralVariantType;
import htsjdk.variant.vcf.VCFConstants;
import org.apache.commons.lang3.tuple.Pair;
import org.broadinstitute.hellbender.tools.spark.sv.utils.GATKSVVCFConstants;
Expand All @@ -30,11 +29,12 @@ public class SVCallRecord implements SVLocatable {
public static final List<String> INVALID_ATTRIBUTES = Lists.newArrayList(
VCFConstants.END_KEY,
GATKSVVCFConstants.ALGORITHMS_ATTRIBUTE,
GATKSVVCFConstants.SVLEN,
GATKSVVCFConstants.CONTIG2_ATTRIBUTE,
GATKSVVCFConstants.END2_ATTRIBUTE,
GATKSVVCFConstants.SVLEN,
GATKSVVCFConstants.STRANDS_ATTRIBUTE,
GATKSVVCFConstants.SVTYPE
GATKSVVCFConstants.SVTYPE,
GATKSVVCFConstants.CPX_TYPE
);

private final String id;
Expand All @@ -44,7 +44,7 @@ public class SVCallRecord implements SVLocatable {
private final String contigB;
private final int positionB;
private final Boolean strandB;
private final StructuralVariantType type;
private final GATKSVVCFConstants.StructuralVariantAnnotationType type;
private final Integer length;
private final List<String> algorithms;
private final List<Allele> alleles;
Expand All @@ -53,21 +53,25 @@ public class SVCallRecord implements SVLocatable {
private final GenotypesContext genotypes;
private final Map<String,Object> attributes;

// CPX related fields
private final GATKSVVCFConstants.ComplexVariantSubtype cpxSubtype;

public SVCallRecord(final String id,
final String contigA,
final int positionA,
final Boolean strandA,
final String contigB,
final int positionB,
final Boolean strandB,
final StructuralVariantType type,
final GATKSVVCFConstants.StructuralVariantAnnotationType type,
final GATKSVVCFConstants.ComplexVariantSubtype cpxSubtype,
final Integer length,
final List<String> algorithms,
final List<Allele> alleles,
final List<Genotype> genotypes,
final Map<String,Object> attributes,
final SAMSequenceDictionary dictionary) {
this(id, contigA, positionA, strandA, contigB, positionB, strandB, type, length, algorithms, alleles, genotypes, attributes);
this(id, contigA, positionA, strandA, contigB, positionB, strandB, type, cpxSubtype, length, algorithms, alleles, genotypes, attributes);
validateCoordinates(dictionary);
}

Expand All @@ -78,24 +82,24 @@ protected SVCallRecord(final String id,
final String contigB,
final int positionB,
final Boolean strandB,
final StructuralVariantType type,
final GATKSVVCFConstants.StructuralVariantAnnotationType type,
final GATKSVVCFConstants.ComplexVariantSubtype cpxSubtype,
final Integer length,
final List<String> algorithms,
final List<Allele> alleles,
final List<Genotype> genotypes,
final Map<String, Object> attributes) {
Utils.nonNull(id);
Utils.nonNull(type);
Utils.nonNull(algorithms);
Utils.nonNull(alleles);
Utils.nonNull(genotypes);
Utils.nonNull(attributes);
this.id = id;
this.id = Utils.nonNull(id);
this.contigA = contigA;
this.positionA = positionA;
this.contigB = contigB;
this.positionB = positionB;
this.type = type;
this.type = Utils.nonNull(type);
this.cpxSubtype = cpxSubtype;
this.algorithms = Collections.unmodifiableList(algorithms);
this.alleles = Collections.unmodifiableList(alleles);
this.altAlleles = alleles.stream().filter(allele -> !allele.isNoCall() && !allele.isReference()).collect(Collectors.toList());
Expand All @@ -110,30 +114,6 @@ protected SVCallRecord(final String id,
this.strandB = strands.getRight();
}

/**
* Convenience constructor without extra attributes
*/
public SVCallRecord(final SVCallRecord baseRecord,
final String id,
final Boolean strandA,
final Boolean strandB,
final StructuralVariantType type,
final Integer length,
final List<String> algorithms,
final List<Allele> alleles,
final List<Genotype> genotypes,
final Map<String,Object> attributes) {
this(id, baseRecord.getContigA(), baseRecord.getPositionA(), strandA, baseRecord.getContigB(),
baseRecord.getPositionB(), strandB, type, length, algorithms, alleles, genotypes, attributes);
}

public SVCallRecord(final SVCallRecord record) {
this(record.id, record.getContigA(), record.getPositionA(), record.getStrandA(),
record.getContigB(), record.getPositionB(), record.getStrandB(),
record.getType(), record.getLength(), record.getAlgorithms(), record.getAlleles(), record.getGenotypes(),
record.getAttributes());
}

/**
* Ensures start/end loci are valid and ordered
*/
Expand Down Expand Up @@ -170,21 +150,23 @@ private static Map<String, Object> validateAttributes(final Map<String, Object>
* @param inputLength assumed length, may be null
* @return variant length, may be null
*/
private static Integer inferLength(final StructuralVariantType type,
private static Integer inferLength(final GATKSVVCFConstants.StructuralVariantAnnotationType type,
final int positionA,
final int positionB,
final Integer inputLength) {
if (type.equals(StructuralVariantType.CNV) || type.equals(StructuralVariantType.DEL)
|| type.equals(StructuralVariantType.DUP) || type.equals(StructuralVariantType.INV)) {
if (type.equals(GATKSVVCFConstants.StructuralVariantAnnotationType.CNV) || type.equals(GATKSVVCFConstants.StructuralVariantAnnotationType.DEL)
|| type.equals(GATKSVVCFConstants.StructuralVariantAnnotationType.DUP) || type.equals(GATKSVVCFConstants.StructuralVariantAnnotationType.INV)) {
// Intrachromosomal classes
final int length = CoordMath.getLength(positionA, positionB);
if (inputLength != null) {
Utils.validateArg(inputLength.intValue() == length, "Input length does not match calculated length");
}
return length;
} else {
Utils.validate(type.equals(StructuralVariantType.INS) || inputLength == null,
"Input length should be null for type " + type.name() + " but found " + inputLength);
if ((type == GATKSVVCFConstants.StructuralVariantAnnotationType.BND
|| type == GATKSVVCFConstants.StructuralVariantAnnotationType.CTX) && inputLength != null) {
throw new IllegalArgumentException("Input length should be null for type " + type.name() + " but found " + inputLength);
}
return inputLength;
}
}
Expand All @@ -198,21 +180,21 @@ private static Integer inferLength(final StructuralVariantType type,
* @param inputStrandB assumed second strand, may be null
* @return pair containing (first strand, second strand), which may contain null values
*/
private static Pair<Boolean, Boolean> inferStrands(final StructuralVariantType type,
private static Pair<Boolean, Boolean> inferStrands(final GATKSVVCFConstants.StructuralVariantAnnotationType type,
final Boolean inputStrandA,
final Boolean inputStrandB) {
if (type.equals(StructuralVariantType.CNV)) {
if (type.equals(GATKSVVCFConstants.StructuralVariantAnnotationType.CNV)) {
Utils.validateArg(inputStrandA == null && inputStrandB == null, "Attempted to create CNV with non-null strands");
return Pair.of(null, null);
} else if (type.equals(StructuralVariantType.DEL) || type.equals(StructuralVariantType.INS)) {
} else if (type.equals(GATKSVVCFConstants.StructuralVariantAnnotationType.DEL) || type.equals(GATKSVVCFConstants.StructuralVariantAnnotationType.INS)) {
if (inputStrandA != null) {
Utils.validateArg(inputStrandA.booleanValue() == true, "Attempted to create DEL/INS with negative first strand");
}
if (inputStrandB != null) {
Utils.validateArg(inputStrandB.booleanValue() == false, "Attempted to create DEL/INS with positive second strand");
}
return Pair.of(Boolean.TRUE, Boolean.FALSE);
} else if (type.equals(StructuralVariantType.DUP)) {
} else if (type.equals(GATKSVVCFConstants.StructuralVariantAnnotationType.DUP)) {
if (inputStrandA != null) {
Utils.validateArg(inputStrandA.booleanValue() == false, "Attempted to create DUP with positive first strand");
}
Expand All @@ -221,11 +203,8 @@ private static Pair<Boolean, Boolean> inferStrands(final StructuralVariantType t
}
return Pair.of(Boolean.FALSE, Boolean.TRUE);
} else {
if (type.equals(StructuralVariantType.INV)) {
if (inputStrandA == null && inputStrandB == null) {
return Pair.of(Boolean.TRUE, Boolean.TRUE);
}
Utils.validateArg(inputStrandA.equals(inputStrandB), "Inversions must have matching strands but found " +
if (type.equals(GATKSVVCFConstants.StructuralVariantAnnotationType.INV)) {
Utils.validateArg(Objects.equals(inputStrandA, inputStrandB), "Inversions must have matching strands but found " +
inputStrandA + " / " + inputStrandB);
}
return Pair.of(inputStrandA, inputStrandB);
Expand Down Expand Up @@ -261,19 +240,17 @@ private boolean isCarrier(final Genotype genotype) {
}

// Otherwise, try to infer status if it's a biallelic CNV with a copy number call
if (isSimpleCNV() && altAlleles.size() == 1 && genotype.hasExtendedAttribute(COPY_NUMBER_FORMAT)) {
final int copyNumber = VariantContextGetters.getAttributeAsInt(genotype, COPY_NUMBER_FORMAT, 0);
final Allele allele = altAlleles.get(0);
if (allele.equals(Allele.SV_SIMPLE_DEL)) {
return copyNumber < expectedCopyNumber;
} else if (allele.equals(Allele.SV_SIMPLE_DUP)) {
return copyNumber > expectedCopyNumber;
}
final int copyNumber = VariantContextGetters.getAttributeAsInt(genotype, COPY_NUMBER_FORMAT, expectedCopyNumber);
if (type == GATKSVVCFConstants.StructuralVariantAnnotationType.DEL) {
return copyNumber < expectedCopyNumber;
} else if (type == GATKSVVCFConstants.StructuralVariantAnnotationType.DUP) {
return copyNumber > expectedCopyNumber;
} else if (type == GATKSVVCFConstants.StructuralVariantAnnotationType.CNV) {
return copyNumber != expectedCopyNumber;
}

// No-calls (that aren't bi-allelic CNVs)
throw new IllegalArgumentException("Cannot determine carrier status for sample " + genotype.getSampleName() +
" in record " + id);
return false;
}

public static int getExpectedCopyNumber(final Genotype genotype) {
Expand Down Expand Up @@ -303,7 +280,9 @@ public boolean isDepthOnly() {
}

public boolean isSimpleCNV() {
return type == StructuralVariantType.DEL || type == StructuralVariantType.DUP || type == StructuralVariantType.CNV;
return type == GATKSVVCFConstants.StructuralVariantAnnotationType.DEL
|| type == GATKSVVCFConstants.StructuralVariantAnnotationType.DUP
|| type == GATKSVVCFConstants.StructuralVariantAnnotationType.CNV;
}

public boolean nullStrands() {
Expand Down Expand Up @@ -347,10 +326,14 @@ public Boolean getStrandB() {
}

@Override
public StructuralVariantType getType() {
public GATKSVVCFConstants.StructuralVariantAnnotationType getType() {
return type;
}

public GATKSVVCFConstants.ComplexVariantSubtype getComplexSubtype() {
return cpxSubtype;
}

public Integer getLength() {
return length;
}
Expand Down
Loading

0 comments on commit c6626ce

Please sign in to comment.