Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Type change int -> long to prevent tranche novel variant count overflow #7864

Merged
merged 2 commits into from
Jun 17, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -27,14 +27,16 @@ public class Tranche {
final double minVQSLod; //minimum value of VQSLOD in this tranche
final double knownTiTv; //titv value of known sites in this tranche
final double novelTiTv; //titv value of novel sites in this tranche
final int numKnown; //number of known sites in this tranche
final int numNovel; //number of novel sites in this tranche
final long numKnown; //number of known sites in this tranche
final long numNovel; //number of novel sites in this tranche
final VariantRecalibratorArgumentCollection.Mode model;
final String name; //Name of the tranche

public Tranche(final String name, final double knownTiTv, final int numNovel, final double minVQSLod, final VariantRecalibratorArgumentCollection.Mode model, final double novelTiTv, final int accessibleTruthSites, final int numKnown, final int callsAtTruthSites) {
public Tranche(final String name, final double knownTiTv, final long numNovel, final double minVQSLod,
final VariantRecalibratorArgumentCollection.Mode model, final double novelTiTv,
final int accessibleTruthSites, final long numKnown, final int callsAtTruthSites) {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Did you check the upstream call sites for uses of int to store the number of known/novel sites? If there's an int anywhere in the call chain the overflow issues will likely persist...

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I found one in the input parsing and fixed it. (The truth-site counts are also ints, but with the resources we use today they are far below Integer.MAX_VALUE, and given the size of the human genome I don't expect that to ever change.)

if ( numKnown < 0 || numNovel < 0) {
throw new GATKException("Invalid tranche - no. variants is < 0 : known " + numKnown + " novel " + numNovel);
throw new GATKException("Invalid tranche " + name + " - no. variants is < 0 : known " + numKnown + " novel " + numNovel);
}

if ( name == null ) {
Expand Down Expand Up @@ -104,7 +106,8 @@ public <T extends Tranche> String getTrancheString(final T prev) {
}

protected static Tranche trancheOfVariants(final List<VariantDatum> data, final int minI, final double ts, final VariantRecalibratorArgumentCollection.Mode model ) {
int numKnown = 0, numNovel = 0, knownTi = 0, knownTv = 0, novelTi = 0, novelTv = 0;
long numKnown = 0, numNovel = 0;
int knownTi = 0, knownTv = 0, novelTi = 0, novelTv = 0;

final double minLod = data.get(minI).lod;
for (final VariantDatum datum : data) {
Expand Down Expand Up @@ -147,8 +150,8 @@ protected static Tranche emptyTranche(final List<VariantDatum> data, final int m

final double knownTiTv = 0.0;
final double novelTiTv = 0.0;
final int numKnown = 0;
final int numNovel = 0;
final long numKnown = 0;
final long numNovel = 0;

return new Tranche("unnamed", knownTiTv, numNovel, minLod, model, novelTiTv, accessibleTruthSites, numKnown, nCallsAtTruth);
}
Expand Down Expand Up @@ -194,6 +197,26 @@ protected static int getOptionalInteger(final Map<String, String> bindings, fina
}
}

/**
 * Looks up {@code key} in {@code bindings} and parses the associated value as a {@code long}.
 *
 * @param bindings key/value pairs to search
 * @param key      name of the entry that must be present
 * @return the value stored under {@code key}, parsed as a base-10 long
 * @throws UserException.MalformedFile if {@code key} is absent, or if its value cannot be parsed as a long
 */
protected static long getRequiredLong(final Map<String, String> bindings, final String key) {
    // Guard clause: a missing key is reported separately from an unparseable value.
    if ( !bindings.containsKey(key) ) {
        throw new UserException.MalformedFile("Malformed tranches file. Missing required key " + key);
    }

    final String rawValue = bindings.get(key);
    try {
        return Long.parseLong(rawValue);
    } catch (final NumberFormatException e) {
        throw new UserException.MalformedFile("Malformed tranches file. Invalid value for key " + key);
    }
}

/**
 * Looks up {@code key} in {@code bindings} and parses the associated value as a {@code long},
 * falling back to {@code defaultValue} when the key is not present.
 *
 * The default is accepted as a {@code long} (existing {@code int} arguments widen implicitly) so
 * that the fallback is not limited to the int range, and it is returned directly instead of being
 * converted to a string and parsed back.
 *
 * @param bindings     key/value pairs to search
 * @param key          name of the optional entry
 * @param defaultValue value to return when {@code key} is absent from {@code bindings}
 * @return the parsed value stored under {@code key}, or {@code defaultValue} if the key is absent
 * @throws UserException.MalformedFile if {@code key} is present but its value cannot be parsed as a long
 */
protected static long getOptionalLong(final Map<String, String> bindings, final String key, final long defaultValue) {
    if ( !bindings.containsKey(key) ) {
        return defaultValue;
    }

    try {
        // parseLong yields the primitive directly; a null value is rejected with NumberFormatException.
        return Long.parseLong(bindings.get(key));
    } catch (final NumberFormatException e) {
        throw new UserException.MalformedFile("Malformed tranches file. Invalid value for key " + key);
    }
}

/**
 * Computes the ratio of {@code callsAtTruthSites} to {@code accessibleTruthSites}.
 *
 * @return {@code callsAtTruthSites / accessibleTruthSites} as a double when
 *         {@code accessibleTruthSites} is greater than zero; otherwise {@code 0.0}
 */
protected double getTruthSensitivity() {
    if ( accessibleTruthSites > 0 ) {
        // Multiply by 1.0 so the division is carried out in floating point.
        final double denominator = 1.0*accessibleTruthSites;
        return callsAtTruthSites / denominator;
    }
    return 0.0;
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -28,9 +28,9 @@ final class TruthSensitivityTranche extends Tranche {
public TruthSensitivityTranche(
final double targetTruthSensitivity,
final double minVQSLod,
final int numKnown,
final long numKnown,
final double knownTiTv,
final int numNovel,
final long numNovel,
final double novelTiTv,
final int accessibleTruthSites,
final int callsAtTruthSites,
Expand All @@ -41,9 +41,9 @@ public TruthSensitivityTranche(
public TruthSensitivityTranche(
final double targetTruthSensitivity,
final double minVQSLod,
final int numKnown,
final long numKnown,
final double knownTiTv,
final int numNovel,
final long numNovel,
final double novelTiTv,
final int accessibleTruthSites,
final int callsAtTruthSites,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -26,9 +26,9 @@ public Double getTrancheIndex() {

public VQSLODTranche(
final double minVQSLod,
final int numKnown,
final long numKnown,
final double knownTiTv,
final int numNovel,
final long numNovel,
final double novelTiTv,
final int accessibleTruthSites,
final int callsAtTruthSites,
Expand Down Expand Up @@ -114,9 +114,9 @@ public static List<VQSLODTranche> readTranches(final GATKPath f) throws IOExcept
}
tranches.add(new VQSLODTranche(
getRequiredDouble(bindings, "minVQSLod"),
getOptionalInteger(bindings, "numKnown", -1),
getOptionalLong(bindings, "numKnown", -1),
getOptionalDouble(bindings, "knownTiTv", -1.0),
getRequiredInteger(bindings, "numNovel"),
getRequiredLong(bindings, "numNovel"),
getRequiredDouble(bindings, "novelTiTv"),
getOptionalInteger(bindings, "accessibleTruthSites", -1),
getOptionalInteger(bindings, "callsAtTruthSites", -1),
Expand Down Expand Up @@ -178,10 +178,10 @@ public static List<TruthSensitivityTranche> mergeAndConvertTranches(final TreeMa

public static VQSLODTranche mergeAndConvertTranches(final List<VQSLODTranche> scatteredTranches, VariantRecalibratorArgumentCollection.Mode mode) {
double indexVQSLOD = scatteredTranches.get(0).minVQSLod;
int sumNumKnown = 0;
long sumNumKnown = 0;
double sumKnownTransitions = 0;
double sumKnownTransversions = 0;
int sumNumNovel = 0;
long sumNumNovel = 0;
double sumNovelTransitions = 0;
double sumNovelTransversions = 0;
int sumAccessibleTruthSites = 0;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,9 +5,13 @@
import org.broadinstitute.hellbender.GATKBaseTest;
import org.broadinstitute.hellbender.testutils.IntegrationTestSpec;
import org.testng.Assert;
import org.testng.annotations.DataProvider;
import org.testng.annotations.Test;

import java.io.File;
import java.io.IOException;
import java.util.Arrays;
import java.util.List;

/**
* Created by gauthier on 7/18/17.
Expand All @@ -16,45 +20,36 @@ public class GatherTranchesIntegrationTest extends CommandLineProgramTest {

private static final String testDir = GATKBaseTest.publicTestDir + "/large/VQSR/";

@Test
public void testCombine2Shards() throws Exception {
final File recal1 = new File(testDir + "snpTranches.scattered.txt"); //this is the output of VariantRecalibratorIntegrationTest.testVariantRecalibratorSNPscattered
final File recal2 = new File(testDir + "snpTranches.scattered.2.txt"); //this is a copy of the above
@DataProvider(name = "testInputs")
public Object[][] getTestInputs () {
return new Object[][]{
{Arrays.asList(new File(testDir + "snpTranches.scattered.txt"), new File(testDir + "snpTranches.scattered.txt")),
new File(testDir + "expected/snpTranches.gathered.txt"), "SNP"},

final File recal_original = new File(testDir + "expected/snpTranches.gathered.txt");
{Arrays.asList(new File(testDir + "indels.0.tranches"), new File(testDir + "indels.1.tranches")),
new File(testDir + "expected/indels.gathered.tranches"), "INDEL"},

final ArgumentsBuilder args = new ArgumentsBuilder();
args.addRaw("--input");
args.addRaw(recal1.getAbsolutePath());
args.addRaw("--input");
args.addRaw(recal2.getAbsolutePath());
args.add("mode", "SNP");
{Arrays.asList(new File(testDir + "test-single-giant-input-snps.tranches")),
new File(testDir + "expected/singleOverflow.tranches"), "SNP"},

final File outFile = GATKBaseTest.createTempFile("gatheredTranches", ".txt");
args.addOutput(outFile);
final Object res = this.runCommandLine(args.getArgsArray());
Assert.assertEquals(res, 0);
IntegrationTestSpec.assertEqualTextFiles(outFile, recal_original);
{Arrays.asList(new File(testDir + "test-very-large-one-snps.tranches"), new File(testDir + "test-very-large-two-snps.tranches")),
new File(testDir + "expected/testSummedOverflow.tranches"), "SNP"}
};
}

@Test
public void testCombine2IndelTranches() throws Exception {
final File tranches1 = new File(testDir + "indels.0.tranches");
final File tranches2 = new File(testDir + "indels.1.tranches");

final File recal_original = new File(testDir + "expected/indels.gathered.tranches");

@Test (dataProvider = "testInputs")
public void testGatherTranches(List<File> inputs, File expected, String mode) throws IOException {
final ArgumentsBuilder args = new ArgumentsBuilder();
args.addRaw("--input");
args.addRaw(tranches1.getAbsolutePath());
args.addRaw("--input");
args.addRaw(tranches2.getAbsolutePath());
args.add("mode", "INDEL");
for (File inFile : inputs) {
args.addRaw("--input");
args.addRaw(inFile);
}
args.add("mode", mode);

final File outFile = GATKBaseTest.createTempFile("gatheredTranches", ".txt");
args.addOutput(outFile);
final Object res = this.runCommandLine(args.getArgsArray());
Assert.assertEquals(res, 0);
IntegrationTestSpec.assertEqualTextFiles(outFile, recal_original);
IntegrationTestSpec.assertEqualTextFiles(outFile, expected);
}
}
Git LFS file not shown
Git LFS file not shown
Git LFS file not shown
Git LFS file not shown
Git LFS file not shown