Skip to content

Commit

Permalink
Performed a round of ablation on new annotation-based filtering tools. (#8131)
Browse files Browse the repository at this point in the history

* Performed a round of ablation on new annotation-based filtering tools.

* Removed Javadoc tags unsupported by Barclay in VETS tool documentation and fixed other minor documentation issues.
  • Loading branch information
samuelklee authored and rickymagner committed Nov 28, 2023
1 parent b921612 commit ea6ae82
Show file tree
Hide file tree
Showing 108 changed files with 269 additions and 710 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,6 @@ fi
echo "Docker build done =========="

sed -r "s/__GATK_DOCKER__/broadinstitute\/gatk\:$HASH_TO_USE/g" $CROMWELL_TEST_DIR/vcf_site_level_filtering.json >$WORKING_DIR/vcf_site_level_filtering_mod.json
sed -r "s/__GATK_DOCKER__/broadinstitute\/gatk\:$HASH_TO_USE/g" $CROMWELL_TEST_DIR/vcf_site_level_filtering_pos_neg.json >$WORKING_DIR/vcf_site_level_filtering_pos_neg_mod.json

echo "Running Filtering WDL through cromwell"

Expand All @@ -41,6 +40,3 @@ done
FIN
cat $WORKING_DIR/vcf_site_level_filtering_mod.json
java -jar $CROMWELL_JAR run $WDL_DIR/JointVcfFiltering.wdl -i $WORKING_DIR/vcf_site_level_filtering_mod.json

cat $WORKING_DIR/vcf_site_level_filtering_pos_neg_mod.json
java -jar $CROMWELL_JAR run $WDL_DIR/JointVcfFiltering.wdl -i $WORKING_DIR/vcf_site_level_filtering_pos_neg_mod.json

This file was deleted.

17 changes: 12 additions & 5 deletions scripts/vcf_site_level_filtering_wdl/JointVcfFiltering.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -29,8 +29,9 @@ workflow JointVcfFiltering {
String resource_args

String? model_backend
File? python_script
File? training_python_script
File? hyperparameters_json
File? scoring_python_script

String? extract_extra_args
String? train_extra_args
Expand All @@ -55,9 +56,9 @@ workflow JointVcfFiltering {
model_backend: "(Optional) Model backend to be used by TrainVariantAnnotationsModel. See GATK documentation for this tool."
python_script: "(Optional) Python script specifying custom model backend to be used by TrainVariantAnnotationsModel. See GATK documentation for this tool."
hyperparameters_json: "(Optional) JSON file specifying model hyperparameters to be used by TrainVariantAnnotationsModel. See GATK documentation for this tool."
extract_extra_args: "(Optional) Catch-all string to provide additional arguments for ExtractVariantAnnotations. This can include intervals (as string arguments or non-localized files), variant-type modes, arguments for enabling positive-negative training, etc. The \"do-not-gzip-vcf-output\" argument is not supported by this workflow. See GATK documentation for this tool."
train_extra_args: "(Optional) Catch-all string to provide additional arguments for TrainVariantAnnotationsModel. This can include variant-type modes, arguments for enabling positive-negative training, etc. See GATK documentation for this tool."
score_extra_args: "(Optional) Catch-all string to provide additional arguments for ScoreVariantAnnotations. This can include intervals (as string arguments or non-localized files), variant-type modes, arguments for enabling positive-negative training and hard filtering, etc. The \"do-not-gzip-vcf-output\" argument is not supported by this workflow. See GATK documentation for this tool."
extract_extra_args: "(Optional) Catch-all string to provide additional arguments for ExtractVariantAnnotations. This can include intervals (as string arguments or non-localized files), variant-type modes, arguments for enabling positive-unlabeled learning, etc. The \"do-not-gzip-vcf-output\" argument is not supported by this workflow. See GATK documentation for this tool."
train_extra_args: "(Optional) Catch-all string to provide additional arguments for TrainVariantAnnotationsModel. This can include variant-type modes, arguments for enabling positive-unlabeled learning, etc. See GATK documentation for this tool."
score_extra_args: "(Optional) Catch-all string to provide additional arguments for ScoreVariantAnnotations. This can include intervals (as string arguments or non-localized files), variant-type modes, arguments for enabling positive-unlabeled learning and hard filtering, etc. The \"do-not-gzip-vcf-output\" argument is not supported by this workflow. See GATK documentation for this tool."
}

call ExtractVariantAnnotations {
Expand All @@ -79,7 +80,7 @@ workflow JointVcfFiltering {
annotations_hdf5 = ExtractVariantAnnotations.annotations_hdf5,
unlabeled_annotations_hdf5 = ExtractVariantAnnotations.unlabeled_annotations_hdf5,
model_backend = model_backend,
python_script = python_script,
python_script = training_python_script,
hyperparameters_json = hyperparameters_json,
output_prefix = output_prefix,
extra_args = train_extra_args,
Expand All @@ -101,6 +102,8 @@ workflow JointVcfFiltering {
extracted_vcf_idx = ExtractVariantAnnotations.extracted_vcf_idx,
model_prefix = output_prefix,
model_files = TrainVariantAnnotationsModel.model_files,
model_backend = model_backend,
python_script = scoring_python_script,
extra_args = score_extra_args,
gatk_docker = gatk_docker,
gatk_override = gatk_override,
Expand Down Expand Up @@ -251,6 +254,8 @@ task ScoreVariantAnnotations {
File extracted_vcf_idx
String model_prefix
Array[File] model_files
String? model_backend
File? python_script
String? extra_args
File? monitoring_script

Expand Down Expand Up @@ -287,6 +292,8 @@ task ScoreVariantAnnotations {
~{resource_args} \
--resource:extracted,extracted=true ~{extracted_vcf} \
--model-prefix model-files/~{model_prefix}.train \
~{"--model-backend " + model_backend} \
~{"--python-script " + python_script} \
~{extra_args}
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@
* Extracts site-level variant annotations, labels, and other metadata from a VCF file to HDF5 files.
*
* <p>
* This tool is intended to be used as the first step in a variant-filtering workflow that supersedes the
* This tool is primarily intended to be used as the first step in a variant-filtering workflow that supersedes the
* {@link VariantRecalibrator} workflow. This tool extracts site-level annotations, labels, and other relevant metadata
* from variant sites (or alleles, in allele-specific mode) that are or are not present in specified labeled
* resource VCFs (e.g., training or calibration VCFs). Input sites that are present in the resources are considered
Expand Down Expand Up @@ -65,7 +65,7 @@
* <ul>
* <li>
* Input VCF file. Site-level annotations will be extracted from the contained variants (or alleles,
* if the {@value USE_ALLELE_SPECIFIC_ANNOTATIONS_LONG_NAME} argument is specified).
* if at least one allele-specific annotation with "Number=A" is specified).
* </li>
* <li>
* Annotations to extract.
Expand All @@ -78,13 +78,12 @@
* </li>
* <li>
* (Optional) Resource VCF file(s). Each resource should be tagged with a label, which will be assigned to
* extracted sites that are present in the resource. In typical use, the {@value LabeledVariantAnnotationsData#TRAINING_LABEL}
* and {@value LabeledVariantAnnotationsData#CALIBRATION_LABEL} labels should be used to tag at least one resource
* apiece. The resulting sets of sites will be used for model training and conversion of scores to
* extracted sites that are present in the resource. In typical use, the "training"
* and "calibration" labels should be used to tag at least one resource apiece.
* The resulting sets of sites will be used for model training and conversion of scores to
* calibration-set sensitivity, respectively; the trustworthiness of the respective resources should be
* taken into account accordingly. The {@value LabeledVariantAnnotationsData#SNP_LABEL} label is
* reserved by the tool, as it is used to label sites determined to be SNPs, and thus it cannot be used to tag
* provided resources.
* taken into account accordingly. The "snp" label is reserved by the tool, as it is used to label sites
* determined to be SNPs, and thus it cannot be used to tag provided resources.
* </li>
* <li>
* (Optional) Maximum number of unlabeled variants (or alleles) to randomly sample with reservoir sampling.
Expand Down Expand Up @@ -128,19 +127,19 @@
* <p>
* Here, each chunk is a double matrix, with dimensions given by (number of sites in the chunk) x (number of annotations).
* See the methods {@link HDF5Utils#writeChunkedDoubleMatrix} and {@link HDF5Utils#writeIntervals} for additional details.
* If {@value USE_ALLELE_SPECIFIC_ANNOTATIONS_LONG_NAME} is specified, each record corresponds to an individual allele;
* In allele-specific mode (i.e., when allele-specific annotations are requested), each record corresponds to an individual allele;
* otherwise, each record corresponds to a variant site, which may contain multiple alleles.
* Storage of alleles can be omitted using the {@value OMIT_ALLELES_IN_HDF5_LONG_NAME} argument, which will reduce
* Storage of alleles can be omitted using the "--omit-alleles-in-hdf5" argument, which will reduce
* the size of the file. This file will only be produced if resources are provided and the number of extracted
* labeled sites is nonzero.
* </p>
*
* </li>
* <li>
* Labeled sites-only VCF file and index. The VCF will not be gzipped if the {@value DO_NOT_GZIP_VCF_OUTPUT_LONG_NAME}
* Labeled sites-only VCF file and index. The VCF will not be gzipped if the "--do-not-gzip-vcf-output"
* argument is set to true. The VCF can be provided as a resource in subsequent runs of
* {@link ScoreVariantAnnotations} and used to indicate labeled sites that were extracted.
* This can be useful if the {@value StandardArgumentDefinitions#INTERVALS_LONG_NAME} argument was used to
* This can be useful if the "--intervals/-L" argument was used to
* subset sites in training or calibration resources for extraction; this may occur when setting up
* training/validation/test splits, for example. Note that records for the random sample of unlabeled sites are
* currently not included in the VCF.
Expand All @@ -149,7 +148,7 @@
* (Optional) Unlabeled-annotations HDF5 file. This will have the same directory structure as in the
* labeled-annotations HDF5 file. However, note that records are currently written in the order they
* appear in the downsampling reservoir after random sampling, and hence, are not in genomic order.
* This file will only be produced if a nonzero value of the {@value MAXIMUM_NUMBER_OF_UNLABELED_VARIANTS_LONG_NAME}
* This file will only be produced if a nonzero value of the "--maximum-number-of-unlabeled-variants"
* argument is provided.
* </li>
* </ul>
Expand All @@ -158,9 +157,9 @@
*
* <p>
* Extract annotations from training/calibration SNP/INDEL sites, producing the outputs
* 1) {@code extract.annot.hdf5}, 2) {@code extract.vcf.gz}, and 3) {@code extract.vcf.gz.tbi}.
* 1) extract.annot.hdf5, 2) extract.vcf.gz, and 3) extract.vcf.gz.tbi.
* The HDF5 file can then be provided to {@link TrainVariantAnnotationsModel}
* to train a model using a positive-only approach. Note that the {@value MODE_LONG_NAME} arguments are made
* to train a model using a positive-only approach. Note that the "--mode" arguments are made
* explicit here, although both SNP and INDEL modes are selected by default.
*
* <pre>
Expand All @@ -182,11 +181,10 @@
* <p>
* Extract annotations from both training/calibration SNP/INDEL sites and a random sample of
* 1000000 unlabeled (i.e., non-training/calibration) sites, producing the outputs
* 1) {@code extract.annot.hdf5}, 2) {@code extract.unlabeled.annot.hdf5}, 3) {@code extract.vcf.gz},
* and 4) {@code extract.vcf.gz.tbi}. The HDF5 files can then be provided to {@link TrainVariantAnnotationsModel}
* to train a model using a positive-negative approach (similar to that used in {@link VariantRecalibrator}).
* Note that the {@value MODE_LONG_NAME} arguments are made explicit here, although both SNP and INDEL modes are
* selected by default.
* 1) extract.annot.hdf5, 2) extract.unlabeled.annot.hdf5, 3) extract.vcf.gz,
* and 4) extract.vcf.gz.tbi. The HDF5 files can then be provided to {@link TrainVariantAnnotationsModel}
* to train a model using a positive-unlabeled approach. Note that the "--mode" arguments
* are made explicit here, although both SNP and INDEL modes are selected by default.
*
* <pre>
* gatk ExtractVariantAnnotations \
Expand All @@ -200,17 +198,23 @@
* --mode INDEL \
* --resource:indel-training,training=true indel-training.vcf \
* --resource:indel-calibration,calibration=true indel-calibration.vcf \
* --maximum-number-of-unlableled-variants 1000000
* --maximum-number-of-unlabeled-variants 1000000
* -O extract
* </pre>
* </p>
*
* <p>
* Note that separate SNP and INDEL resources are shown in the above examples purely for demonstration purposes,
* as are separate training and calibration resources. However, it may be desirable to specify combined
* resource(s); e.g., "--resource:snp-and-indel-resource,training=true,calibration=true snp-and-indel-resource.vcf".
* </p>
*
* <p>
* In the (atypical) event that resource VCFs are unavailable, one can still extract annotations from a random sample of
* unlabeled sites, producing the outputs 1) {@code extract.unlabeled.annot.hdf5},
* 2) {@code extract.vcf.gz} (which will contain no records), and 3) {@code extract.vcf.gz.tbi}.
* unlabeled sites, producing the outputs 1) extract.unlabeled.annot.hdf5,
* 2) extract.vcf.gz (which will contain no records), and 3) extract.vcf.gz.tbi.
* This random sample cannot be used by {@link TrainVariantAnnotationsModel}, but may still be useful for
* exploratory analyses. Note that the {@value MODE_LONG_NAME} arguments are made explicit here, although both
* exploratory analyses. Note that the "--mode" arguments are made explicit here, although both
* SNP and INDEL modes are selected by default.
*
* <pre>
Expand All @@ -221,12 +225,20 @@
* -A annotation_N \
* --mode SNP \
* --mode INDEL \
* --maximum-number-of-unlableled-variants 1000000
* --maximum-number-of-unlabeled-variants 1000000
* -O extract
* </pre>
* </p>
*
* DEVELOPER NOTE: See documentation in {@link LabeledVariantAnnotationsWalker}.
* <p>
* Alternatively, if resource VCFs are unavailable, one might want to specify the input VCF itself as a resource
* and extract annotations for the input variants (or a subset thereof). Again, this may be useful for
* exploratory analyses.
* </p>
*
* <p>
* DEVELOPER NOTE: See documentation in {@link LabeledVariantAnnotationsWalker}.
* </p>
*
* @author Samuel Lee &lt;slee@broadinstitute.org&gt;
*/
Expand All @@ -249,11 +261,10 @@ public final class ExtractVariantAnnotations extends LabeledVariantAnnotationsWa
doc = "Maximum number of unlabeled variants to extract. " +
"If greater than zero, reservoir sampling will be used to randomly sample this number " +
"of sites from input sites that are not present in the specified resources. " +
"Choice of this number should be guided by considerations for training the negative model in " +
"Choice of this number should be guided by considerations for training the model in " +
"TrainVariantAnnotationsModel; users may wish to choose a number that is comparable to the " +
"expected size of the labeled training set or that is compatible with available memory resources. " +
"Note that in allele-specific mode (--" + LabeledVariantAnnotationsWalker.USE_ALLELE_SPECIFIC_ANNOTATIONS_LONG_NAME +
" true), this argument limits the number of variant records, rather than the number of alleles.",
"Note that in allele-specific mode, this argument limits the number of variant records, rather than the number of alleles.",
minValue = 0)
private int maximumNumberOfUnlabeledVariants = 0;

Expand Down
Loading

0 comments on commit ea6ae82

Please sign in to comment.