-
Notifications
You must be signed in to change notification settings - Fork 596
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
VET Ingest Validation / Allow Ingest of non-VQSR'ed data #7870
Changes from 2 commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -27,6 +27,7 @@ public class VetCreator { | |
private final String sampleId; | ||
private PendingBQWriter vetBQJsonWriter = null; | ||
private final boolean forceLoadingFromNonAlleleSpecific; | ||
private final boolean skipLoadingVqsrFields; | ||
|
||
private static final String VET_FILETYPE_PREFIX = "vet_"; | ||
|
||
|
@@ -35,10 +36,11 @@ public static boolean doRowsExistFor(CommonCode.OutputType outputType, String pr | |
return BigQueryUtils.doRowsExistFor(projectId, datasetName, VET_FILETYPE_PREFIX + tableNumber, sampleId); | ||
} | ||
|
||
public VetCreator(String sampleIdentifierForOutputFileName, String sampleId, String tableNumber, final File outputDirectory, final CommonCode.OutputType outputType, final String projectId, final String datasetName, final boolean forceLoadingFromNonAlleleSpecific) { | ||
public VetCreator(String sampleIdentifierForOutputFileName, String sampleId, String tableNumber, final File outputDirectory, final CommonCode.OutputType outputType, final String projectId, final String datasetName, final boolean forceLoadingFromNonAlleleSpecific, final boolean skipLoadingVqsrFields) { | ||
this.sampleId = sampleId; | ||
this.outputType = outputType; | ||
this.forceLoadingFromNonAlleleSpecific = forceLoadingFromNonAlleleSpecific; | ||
this.skipLoadingVqsrFields = skipLoadingVqsrFields; | ||
|
||
try { | ||
String PREFIX_SEPARATOR = "_"; | ||
|
@@ -64,11 +66,6 @@ public VetCreator(String sampleIdentifierForOutputFileName, String sampleId, Str | |
public void apply(VariantContext variant) throws IOException { | ||
int start = variant.getStart(); | ||
long location = SchemaUtils.encodeLocation(variant.getContig(), start); | ||
List<String> row = createRow( | ||
location, | ||
variant, | ||
sampleId | ||
); | ||
Comment on lines
-67
to
-71
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. TOL perhaps rather than deleting this and inlining the There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Not sure I understand. Do you not like the inlining and want the code block to remain explicit in the There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Right, I was suggesting to keep this block explicit but only in the There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. 👍 |
||
|
||
switch(outputType) { | ||
case BQ: | ||
|
@@ -80,7 +77,7 @@ public void apply(VariantContext variant) throws IOException { | |
break; | ||
case TSV: | ||
// write the variant to the XSV | ||
vetWriter.getNewLineBuilder().setRow(row).write(); | ||
vetWriter.getNewLineBuilder().setRow(createRow(location, variant, sampleId)).write(); | ||
break; | ||
|
||
} | ||
|
@@ -91,9 +88,11 @@ public List<String> createRow(final long location, final VariantContext variant, | |
for ( final VetFieldEnum fieldEnum : VetFieldEnum.values() ) { | ||
if (fieldEnum.equals(VetFieldEnum.location)) { | ||
row.add(String.valueOf(location)); | ||
} else if (fieldEnum.equals(VetFieldEnum.sample_id)) { | ||
} | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yeah, that's me. I followed a different style guide in a former life and don't like the way it looks like that. But I'll stick with what IJ does. |
||
else if (fieldEnum.equals(VetFieldEnum.sample_id)) { | ||
row.add(sampleId); | ||
} else { | ||
} | ||
else if ( !(skipLoadingVqsrFields && fieldEnum.isVqsrSpecificField() ) ) { | ||
row.add(fieldEnum.getColumnValue(variant, forceLoadingFromNonAlleleSpecific)); | ||
} | ||
} | ||
|
@@ -106,12 +105,15 @@ public JSONObject createJson(final long location, final VariantContext variant, | |
for ( final VetFieldEnum fieldEnum : VetFieldEnum.values() ) { | ||
if (fieldEnum.equals(VetFieldEnum.location)) { | ||
jsonObject.put(VetFieldEnum.location.toString(), location); | ||
} else if (fieldEnum.equals(VetFieldEnum.sample_id)) { | ||
} | ||
else if (fieldEnum.equals(VetFieldEnum.sample_id)) { | ||
jsonObject.put(VetFieldEnum.sample_id.toString(), sampleId); | ||
} else if (fieldEnum.equals(VetFieldEnum.call_GQ)) { | ||
} | ||
else if (fieldEnum.equals(VetFieldEnum.call_GQ)) { | ||
jsonObject.put(fieldEnum.toString(), Integer.valueOf(fieldEnum.getColumnValue(variant, forceLoadingFromNonAlleleSpecific))); | ||
} else { | ||
String strVal = fieldEnum.getColumnValue(variant, forceLoadingFromNonAlleleSpecific); | ||
} | ||
else { | ||
final String strVal = !(skipLoadingVqsrFields && fieldEnum.isVqsrSpecificField()) ? fieldEnum.getColumnValue(variant, forceLoadingFromNonAlleleSpecific) : ""; | ||
jsonObject.put(fieldEnum.toString(), StringUtils.isEmpty(strVal) ? null : strVal); | ||
} | ||
} | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -30,12 +30,12 @@ | |
* location, // req | ||
* reference_bases, // req | ||
* alternate_bases_alt, // req | ||
* alternate_bases_AS_RAW_MQ, // req | ||
* alternate_bases_AS_RAW_MQ, | ||
* alternate_bases_AS_RAW_MQRankSum, | ||
* alternate_bases_AS_QUALapprox, // req | ||
* alternate_bases_AS_QUALapprox, | ||
* alternate_bases_AS_RAW_ReadPosRankSum, | ||
* alternate_bases_AS_SB_TABLE, // req | ||
* alternate_bases_AS_VarDP, // req | ||
* alternate_bases_AS_SB_TABLE, | ||
* alternate_bases_AS_VarDP, | ||
* call_genotype, // req | ||
* call_AD, | ||
* call_DP, // Laura says consider removing for now-- so similar to AS_VarDP | ||
|
@@ -76,7 +76,7 @@ public String getColumnValue(final VariantContext variant, final boolean forceLo | |
}, | ||
|
||
AS_RAW_MQ { | ||
// Required | ||
// Required for VQSR Data | ||
// can strip off the first one? | ||
// TODO sci notation? | ||
public String getColumnValue(final VariantContext variant, final boolean forceLoadingFromNonAlleleSpecific) { | ||
|
@@ -87,10 +87,10 @@ public String getColumnValue(final VariantContext variant, final boolean forceLo | |
throw new UserException("Cannot be missing required value for alternate_bases.AS_RAW_MQ, RAW_MQandDP or RAW_MQ."); | ||
} | ||
if (!forceLoadingFromNonAlleleSpecific && out != null) { | ||
if(!out.endsWith("|0.00")) { | ||
// KC: we are seeing a TON of these! | ||
// if (!out.endsWith("|0.00")) { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. commented out code There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yeah, @kcibul commented out the logger statement, I just expanded the commenting out to take out all the functional code. |
||
// logger.warn("Expected AS_RAW_MQ value to end in |0.00. value is: " + out + " for variant " + variant.toString()); | ||
} | ||
// } | ||
out = out.substring(0, out.lastIndexOf("|")); | ||
String[] outValues = out.split("\\|"); | ||
out = Arrays | ||
|
@@ -124,9 +124,13 @@ public String getColumnValue(final VariantContext variant, final boolean forceLo | |
return outNotAlleleSpecific; | ||
} | ||
} | ||
public boolean isVqsrSpecificField() { | ||
return true; | ||
} | ||
}, | ||
|
||
AS_RAW_MQRankSum { // TODO -- maybe rely on 1/1 for call_GT, also get rid of the | at the beginning | ||
// Required for VQSR Data | ||
public String getColumnValue(final VariantContext variant, final boolean forceLoadingFromNonAlleleSpecific) { | ||
// in the case where neither allele is reference, don't return a value | ||
if (isGenotypeAllNonRef(variant.getGenotype(0))) { | ||
|
@@ -175,27 +179,34 @@ public String getColumnValue(final VariantContext variant, final boolean forceLo | |
//throw new UserException("Expected AS_RAW_MQRankSum value to be ||, ||| or to end in |NaN"); | ||
} | ||
return out; | ||
|
||
} | ||
public boolean isVqsrSpecificField() { | ||
return true; | ||
} | ||
}, | ||
|
||
QUALapprox { // Required | ||
QUALapprox { | ||
// Required for VQSR Data | ||
public String getColumnValue(final VariantContext variant, final boolean forceLoadingFromNonAlleleSpecific) { | ||
String out = getAttribute(variant, RAW_QUAL_APPROX_KEY, null); | ||
if (out == null) { | ||
throw new UserException("Cannot be missing required value for QUALapprox at site: " + variant.toString()); | ||
throw new UserException("Cannot be missing required value for QUALapprox at site: " + variant); | ||
} | ||
return out; | ||
} | ||
public boolean isVqsrSpecificField() { | ||
return true; | ||
} | ||
}, | ||
|
||
AS_QUALapprox { // Required | ||
AS_QUALapprox { | ||
// Required for VQSR Data | ||
public String getColumnValue(final VariantContext variant, final boolean forceLoadingFromNonAlleleSpecific) { | ||
String out = getAttribute(variant, AS_RAW_QUAL_APPROX_KEY, null); | ||
if (out == null) { | ||
String outNotAlleleSpecific = getAttribute(variant, RAW_QUAL_APPROX_KEY, null); | ||
if (outNotAlleleSpecific == null) { | ||
throw new UserException("Cannot be missing required value for AS_QUALapprox or QUALapprox at site: " + variant.toString()); | ||
throw new UserException("Cannot be missing required value for AS_QUALapprox or QUALapprox at site: " + variant); | ||
} | ||
return outNotAlleleSpecific; | ||
} | ||
|
@@ -217,6 +228,9 @@ public String getColumnValue(final VariantContext variant, final boolean forceLo | |
} | ||
return out; | ||
} | ||
public boolean isVqsrSpecificField() { | ||
return true; | ||
} | ||
}, | ||
|
||
AS_RAW_ReadPosRankSum { // TODO -- maybe rely on 1/1 for call_GT | ||
|
@@ -268,9 +282,13 @@ public String getColumnValue(final VariantContext variant, final boolean forceLo | |
} | ||
return out; | ||
} | ||
public boolean isVqsrSpecificField() { | ||
return true; | ||
} | ||
}, | ||
|
||
AS_SB_TABLE { // Required | ||
AS_SB_TABLE { | ||
// Required for VQSR Data | ||
public String getColumnValue(final VariantContext variant, final boolean forceLoadingFromNonAlleleSpecific) { | ||
String out = getAttribute(variant, GATKVCFConstants.AS_SB_TABLE_KEY, null); | ||
if (forceLoadingFromNonAlleleSpecific || out == null) { | ||
|
@@ -296,16 +314,20 @@ public String getColumnValue(final VariantContext variant, final boolean forceLo | |
} | ||
return out; | ||
} | ||
public boolean isVqsrSpecificField() { | ||
return true; | ||
} | ||
}, | ||
|
||
AS_VarDP { // Required | ||
AS_VarDP { | ||
// Required for VQSR Data | ||
public String getColumnValue(final VariantContext variant, final boolean forceLoadingFromNonAlleleSpecific) { | ||
String out = getAttribute(variant, AS_VARIANT_DEPTH_KEY, null); | ||
if (out == null) { | ||
String varDP = getAttribute(variant, VARIANT_DEPTH_KEY, null); | ||
String dP = getAttribute(variant, DEPTH_KEY, null); | ||
if (varDP == null || dP == null) { | ||
throw new UserException("Cannot be missing required value for AS_VarDP, or VarDP and DP, at site:" + variant.toString()); | ||
throw new UserException("Cannot be missing required value for AS_VarDP, or VarDP and DP, at site:" + variant); | ||
} | ||
int refDP = Integer.parseInt(dP) - Integer.parseInt(varDP); | ||
out = refDP + "|" + varDP + "|"; | ||
|
@@ -319,10 +341,16 @@ public String getColumnValue(final VariantContext variant, final boolean forceLo | |
} | ||
return out; | ||
} | ||
public boolean isVqsrSpecificField() { | ||
return true; | ||
} | ||
}, | ||
|
||
call_GT { | ||
public String getColumnValue(final VariantContext variant, final boolean forceLoadingFromNonAlleleSpecific) { | ||
if (!variant.hasGenotypes()) { | ||
throw new UserException("Cannot be missing required value for call.GT"); | ||
} | ||
// TODO how is missing handled? | ||
return CommonCode.getGTString(variant); | ||
} | ||
|
@@ -357,8 +385,7 @@ public String getColumnValue(final VariantContext variant, final boolean forceLo | |
call_GQ { // Required | ||
public String getColumnValue(final VariantContext variant, final boolean forceLoadingFromNonAlleleSpecific) { | ||
if (!variant.getGenotype(0).hasGQ()) { | ||
//throw new UserException("Cannot be missing required value for call.GQ"); | ||
return ""; | ||
throw new UserException("Cannot be missing required value for call.GQ"); | ||
} | ||
return String.valueOf(variant.getGenotype(0).getGQ()); | ||
} | ||
|
@@ -389,6 +416,10 @@ public String getColumnValue(final VariantContext variant, final boolean forceLo | |
throw new IllegalArgumentException("Not implemented"); | ||
} | ||
|
||
public boolean isVqsrSpecificField() { | ||
return false; | ||
} | ||
|
||
public static boolean isGenotypeAllNonRef(Genotype g) { | ||
return g.getAllele(0).isNonReference() && g.getAllele(1).isNonReference(); | ||
} | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This addresses VS-455