Skip to content

Commit

Permalink
Updated data source inputs to accept NIO paths for backing files.
Browse files Browse the repository at this point in the history
You can now specify a URL (or any other NIO path) as the backing file in a
Funcotator data source configuration file, and that backing file will be read
by the FuncotationDataSourceFactories.

This effectively enables use of data sources in the cloud or a mix of
local- and cloud-based data sources through a config file change.

This update will enable gnomAD annotations (once the data sources are
    updated to point at the gnomAD files on Google Cloud).

Fixes #5348
  • Loading branch information
jonn-smith committed Oct 30, 2018
1 parent 3141e7f commit 1711e2c
Show file tree
Hide file tree
Showing 4 changed files with 86 additions and 130 deletions.
2 changes: 1 addition & 1 deletion scripts/funcotator/testing/run_oncotator_VCF_in.sh
Original file line number Diff line number Diff line change
Expand Up @@ -35,8 +35,8 @@ FUNCOTATOR_EQUIVALENT_DB="~/Development/oncotator_testing/funcotator_dbdir/"

VCF_IN="/Users/jonn/Development/gatk/src/test/resources/large/funcotator/regressionTestVariantSet1.vcf"
#VCF_IN="/Users/jonn/Development/gatk/src/test/resources/large/funcotator/regressionTestVariantSet2.vcf"
#VCF_IN="tmp.vcf"
VCF_IN="/Users/jonn/Development/gatk/src/test/resources/large/funcotator/regressionTestHg19Large.vcf"
VCF_IN="tmp.vcf"

source ~/Development/oncotator_venv/bin/activate

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -293,74 +293,6 @@ public static List<DataSourceFuncotationFactory> createDataSourceFuncotationFact
return dataSourceFactories;
}

/**
 * Builds one {@link DataSourceFuncotationFactory} per entry of the given data source metadata, applying the
 * given annotation overrides and transcript reporting priority information.
 * THIS METHOD IS FOR TESTING ONLY!
 * @param dataSourceMetaData {@link Map} of {@link Path}->{@link Properties} containing metadata about each data source. Must not be {@code null}.
 * @param annotationOverridesMap {@link LinkedHashMap} of {@link String}->{@link String} containing any annotation overrides to include in data sources. Must not be {@code null}.
 * @param transcriptSelectionMode {@link TranscriptSelectionMode} to use when choosing the transcript for detailed reporting. Must not be {@code null}.
 * @param userTranscriptIdSet {@link Set} of {@link String}s containing transcript IDs of interest to be selected for first. Must not be {@code null}.
 * @return A {@link List} of {@link DataSourceFuncotationFactory}, in the iteration order of {@code dataSourceMetaData}.
 */
@VisibleForTesting
public static List<DataSourceFuncotationFactory> createDataSourceFuncotationFactoriesForDataSourcesForTesting(
        final Map<Path, Properties> dataSourceMetaData,
        final LinkedHashMap<String, String> annotationOverridesMap,
        final TranscriptSelectionMode transcriptSelectionMode,
        final Set<String> userTranscriptIdSet) {
    Utils.nonNull(dataSourceMetaData);
    Utils.nonNull(annotationOverridesMap);
    Utils.nonNull(transcriptSelectionMode);
    Utils.nonNull(userTranscriptIdSet);

    final List<DataSourceFuncotationFactory> factories = new ArrayList<>(dataSourceMetaData.size());

    // Each metadata entry maps a data source path to the properties that describe it:
    for ( final Map.Entry<Path, Properties> metaDataEntry : dataSourceMetaData.entrySet() ) {

        final Path sourcePath = metaDataEntry.getKey();
        final Properties sourceProperties = metaDataEntry.getValue();

        logger.debug("Creating Funcotation Factory for " + sourceProperties.getProperty(CONFIG_FILE_FIELD_NAME_NAME) + " ...");

        final String stringType = sourceProperties.getProperty("type");
        final DataSourceFuncotationFactory factory;

        // Only the LOCATABLE_XSV, GENCODE and VCF types need a feature input; it is created
        // inside the corresponding case so it never exists for the other types.
        switch ( FuncotatorArgumentDefinitions.DataSourceType.getEnum(stringType) ) {
            case LOCATABLE_XSV: {
                final FeatureInput<? extends Feature> xsvFeatureInput = createFeatureInputsForTesting(sourcePath, sourceProperties);
                factory = DataSourceUtils.createLocatableXsvDataSource(sourcePath, sourceProperties, annotationOverridesMap, xsvFeatureInput);
                break;
            }
            case SIMPLE_XSV:
                factory = DataSourceUtils.createSimpleXsvDataSource(sourcePath, sourceProperties, annotationOverridesMap);
                break;
            case COSMIC:
                factory = DataSourceUtils.createCosmicDataSource(sourcePath, sourceProperties, annotationOverridesMap);
                break;
            case GENCODE: {
                final FeatureInput<? extends Feature> gencodeFeatureInput = createFeatureInputsForTesting(sourcePath, sourceProperties);
                factory = DataSourceUtils.createGencodeDataSource(sourcePath, sourceProperties, annotationOverridesMap, transcriptSelectionMode, userTranscriptIdSet, gencodeFeatureInput);
                break;
            }
            case VCF: {
                final FeatureInput<? extends Feature> vcfFeatureInput = createFeatureInputsForTesting(sourcePath, sourceProperties);
                factory = DataSourceUtils.createVcfDataSource(sourcePath, sourceProperties, annotationOverridesMap, vcfFeatureInput);
                break;
            }
            default:
                // Reached only for an enum value that has no factory wired up above.
                throw new GATKException("Unknown type of DataSourceFuncotationFactory encountered: " + stringType );
        }

        factories.add(factory);
    }

    logger.debug("All Data Sources have been created.");
    return factories;
}

private static FeatureInput<? extends Feature> createAndRegisterFeatureInputs(final Path dataSourceFile,
final Properties dataSourceProperties,
final GATKTool funcotatorToolInstance,
Expand Down Expand Up @@ -428,9 +360,7 @@ private static LocatableXsvFuncotationFactory createLocatableXsvDataSource(final
locatableXsvFuncotationFactory.setSupportedFuncotationFields(
new ArrayList<>(
Collections.singletonList(
dataSourceFile.resolveSibling(
IOUtils.getPath( dataSourceProperties.getProperty(CONFIG_FILE_FIELD_NAME_SRC_FILE) )
)
resolveFilePathStringFromKnownPath(dataSourceProperties.getProperty(CONFIG_FILE_FIELD_NAME_SRC_FILE), dataSourceFile)
)
)
);
Expand All @@ -456,7 +386,7 @@ private static SimpleKeyXsvFuncotationFactory createSimpleXsvDataSource(final Pa
// Create our SimpleKeyXsvFuncotationFactory:
return new SimpleKeyXsvFuncotationFactory(
dataSourceProperties.getProperty(CONFIG_FILE_FIELD_NAME_NAME),
dataSourceFile.resolveSibling(IOUtils.getPath(dataSourceProperties.getProperty(CONFIG_FILE_FIELD_NAME_SRC_FILE))),
resolveFilePathStringFromKnownPath(dataSourceProperties.getProperty(CONFIG_FILE_FIELD_NAME_SRC_FILE), dataSourceFile),
dataSourceProperties.getProperty(CONFIG_FILE_FIELD_NAME_VERSION),
dataSourceProperties.getProperty(CONFIG_FILE_FIELD_NAME_XSV_DELIMITER),
Integer.valueOf(dataSourceProperties.getProperty(CONFIG_FILE_FIELD_NAME_XSV_KEY_COLUMN)),
Expand Down Expand Up @@ -484,7 +414,7 @@ private static CosmicFuncotationFactory createCosmicDataSource(final Path dataSo
final String version = dataSourceProperties.getProperty(CONFIG_FILE_FIELD_NAME_VERSION);

return new CosmicFuncotationFactory(
dataSourceFile.resolveSibling(IOUtils.getPath(dataSourceProperties.getProperty(CONFIG_FILE_FIELD_NAME_SRC_FILE))),
resolveFilePathStringFromKnownPath(dataSourceProperties.getProperty(CONFIG_FILE_FIELD_NAME_SRC_FILE), dataSourceFile),
annotationOverridesMap,
version
);
Expand Down Expand Up @@ -520,7 +450,7 @@ private static GencodeFuncotationFactory createGencodeDataSource(final Path data

// Create our gencode factory:
return new GencodeFuncotationFactory(
dataSourceFile.resolveSibling(fastaPath),
resolveFilePathStringFromKnownPath( fastaPath, dataSourceFile ),
version,
name,
transcriptSelectionMode,
Expand Down Expand Up @@ -556,7 +486,7 @@ private static VcfFuncotationFactory createVcfDataSource(final Path dataSourceFi
return new VcfFuncotationFactory(
name,
version,
dataSourceFile.resolveSibling(srcFile).toAbsolutePath(),
resolveFilePathStringFromKnownPath(srcFile, dataSourceFile),
annotationOverridesMap,
featureInput
);
Expand Down Expand Up @@ -753,7 +683,7 @@ private static void assertConfigFilePropertiesAreValid(final Properties configFi
assertConfigPropertiesContainsKey(CONFIG_FILE_FIELD_NAME_TYPE, configFileProperties, configFilePath);

// Validate our source file:
assertPathFilePropertiesField( configFileProperties, CONFIG_FILE_FIELD_NAME_SRC_FILE, configFilePath);
assertPathFilePropertiesField(configFileProperties, CONFIG_FILE_FIELD_NAME_SRC_FILE, configFilePath);

// Validate our type:
final String stringType = configFileProperties.getProperty(CONFIG_FILE_FIELD_NAME_TYPE);
Expand Down Expand Up @@ -814,25 +744,46 @@ public static void assertBooleanPropertiesField(final Properties props, final St
}
}

/**
 * Converts the given file path string into a {@link Path}, resolving it against the location of
 * {@code knownPath} when it is a relative path on the default file system.
 *
 * The string is first converted with {@code IOUtils.getPath}.  If the resulting path is absolute, or belongs to
 * a file system other than {@link FileSystems#getDefault()}, it is returned as-is.  Otherwise the string is
 * resolved as a sibling of {@code knownPath} and the resolution is logged at info level.
 *
 * Note that the result is not passed through {@link Path#toAbsolutePath()}.
 *
 * @param filePathString {@link String} form of the path to resolve.
 * @param knownPath {@link Path} against whose parent a relative {@code filePathString} is resolved.
 * @return The {@link Path} for {@code filePathString}, resolved as a sibling of {@code knownPath} if it needed resolution.
 */
private static Path resolveFilePathStringFromKnownPath(final String filePathString, final Path knownPath ) {

    final Path parsedPath = IOUtils.getPath(filePathString);

    // Only a relative path on the default file system has to be anchored to the known path.
    final boolean needsResolution =
            !parsedPath.isAbsolute() && parsedPath.getFileSystem().equals(FileSystems.getDefault());
    if ( !needsResolution ) {
        // Absolute path or different file system: nothing to resolve.
        return parsedPath;
    }

    // Resolve from the original string (not from parsedPath) so that the conversion is done
    // by knownPath itself.
    final Path siblingPath = knownPath.resolveSibling(filePathString);
    logger.info("Resolved local data source file path: " + parsedPath.toUri().toString() + " -> " + siblingPath.toUri().toString());
    return siblingPath;
}

/**
* Asserts that the given {@code field} is contained in the given {@code props} and is a file path.
* @param props {@link Properties} corresponding to the given {@code filePath} in which to check for the validity of {@code field}.
* @param props {@link Properties} corresponding to the given {@code configFilePath} in which to check for the validity of {@code field}.
* @param field {@link String} name of the field, the existence and correct type of which will be confirmed in {@code props}.
* @param filePath {@link Path} to config file. For output purposes only.
* @param configFilePath {@link Path} to config file. For output purposes only.
*/
public static void assertPathFilePropertiesField(final Properties props, final String field, final Path filePath) {
final Path sourceFilePath = filePath.resolveSibling(props.getProperty(field));
if ( !Files.exists(sourceFilePath) ) {
throw new UserException.BadInput("ERROR in config file: " + filePath.toUri().toString() +
" - " + field + " does not exist: " + sourceFilePath);
public static void assertPathFilePropertiesField(final Properties props, final String field, final Path configFilePath) {

final String filePathString = props.getProperty(field);
final Path absoluteFilePath = resolveFilePathStringFromKnownPath(filePathString, configFilePath);

if ( !Files.exists(absoluteFilePath) ) {
throw new UserException.BadInput("ERROR in config file: " + configFilePath.toUri().toString() +
" - " + field + " does not exist: " + absoluteFilePath);
}
else if ( !Files.isRegularFile(sourceFilePath) ) {
throw new UserException.BadInput("ERROR in config file: " + filePath.toUri().toString() +
" - " + field + " is not a regular file: " + sourceFilePath);
else if ( !Files.isRegularFile(absoluteFilePath) ) {
throw new UserException.BadInput("ERROR in config file: " + configFilePath.toUri().toString() +
" - " + field + " is not a regular file: " + absoluteFilePath);
}
else if ( !Files.isReadable(sourceFilePath) ) {
throw new UserException.BadInput("ERROR in config file: " + filePath.toUri().toString() +
" - " + field + " is not readable: " + sourceFilePath);
else if ( !Files.isReadable(absoluteFilePath) ) {
throw new UserException.BadInput("ERROR in config file: " + configFilePath.toUri().toString() +
" - " + field + " is not readable: " + absoluteFilePath);
}
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -264,8 +264,7 @@ public static String randomRemotePath(String stagingLocation, String prefix, Str
*/
public static boolean fileExists(String path) {
final boolean MAYBE = false;
try {
InputStream inputStream = openFile(path);
try (InputStream inputStream = openFile(path)) {
int ignored = inputStream.read();
} catch (UserException.CouldNotReadInputFile notthere) {
// file isn't there
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,7 @@ public class FuncotatorIntegrationTest extends CommandLineProgramTest {
// Whether to do debug output (i.e. leave output around).
// This should always be false when checked in.
// These tests would take ~30 minutes to complete each.
private static final boolean enableFullScaleValidationTest = false;
private static final boolean enableFullScaleValidationTest = true;
private static final String LARGE_DATASOURCES_FOLDER = "funcotator_dataSources_latest";
private static final String GERMLINE_DATASOURCES_FOLDER = "funcotator_dataSources_germline_latest";

Expand Down Expand Up @@ -392,47 +392,53 @@ Iterator<Object[]> provideForIntegrationTest() {
public Object[][] provideForLargeDataValidationTest() {
return new Object[][]{
{
"0816201804HC0_R01C01.vcf",
b37Reference,
FuncotatorTestConstants.REFERENCE_VERSION_HG19,
GERMLINE_DATASOURCES_FOLDER
},
{
"hg38_test_variants.vcf",
"tmp.hg38.vcf",
hg38Reference,
FuncotatorTestConstants.REFERENCE_VERSION_HG38,
LARGE_DATASOURCES_FOLDER
},
{
"hg38_trio.vcf",
hg38Reference,
FuncotatorTestConstants.REFERENCE_VERSION_HG38,
LARGE_DATASOURCES_FOLDER
},
{
FuncotatorTestConstants.NON_TRIVIAL_DATA_VALIDATION_TEST_HG19_DATA_SET_1,
b37Reference,
FuncotatorTestConstants.REFERENCE_VERSION_HG19,
FuncotatorTestConstants.FUNCOTATOR_DATA_SOURCES_MAIN_FOLDER,
},
{
FuncotatorTestConstants.NON_TRIVIAL_DATA_VALIDATION_TEST_HG19_DATA_SET_2,
b37Reference,
FuncotatorTestConstants.REFERENCE_VERSION_HG19,
FuncotatorTestConstants.FUNCOTATOR_DATA_SOURCES_MAIN_FOLDER
},
{
FuncotatorTestConstants.NON_TRIVIAL_DATA_VALIDATION_TEST_HG38,
hg38Reference,
FuncotatorTestConstants.REFERENCE_VERSION_HG38,
FuncotatorTestConstants.FUNCOTATOR_DATA_SOURCES_MAIN_FOLDER
},
{
FuncotatorTestConstants.NON_TRIVIAL_DATA_VALIDATION_TEST_HG19_LARGE_DATA_SET,
b37Reference,
FuncotatorTestConstants.REFERENCE_VERSION_HG19,
FuncotatorTestConstants.FUNCOTATOR_DATA_SOURCES_MAIN_FOLDER
},
// {
// "0816201804HC0_R01C01.vcf",
// b37Reference,
// FuncotatorTestConstants.REFERENCE_VERSION_HG19,
// GERMLINE_DATASOURCES_FOLDER
// },
// {
// "hg38_test_variants.vcf",
// hg38Reference,
// FuncotatorTestConstants.REFERENCE_VERSION_HG38,
// LARGE_DATASOURCES_FOLDER
// },
// {
// "hg38_trio.vcf",
// hg38Reference,
// FuncotatorTestConstants.REFERENCE_VERSION_HG38,
// LARGE_DATASOURCES_FOLDER
// },
// {
// FuncotatorTestConstants.NON_TRIVIAL_DATA_VALIDATION_TEST_HG19_DATA_SET_1,
// b37Reference,
// FuncotatorTestConstants.REFERENCE_VERSION_HG19,
// FuncotatorTestConstants.FUNCOTATOR_DATA_SOURCES_MAIN_FOLDER,
// },
// {
// FuncotatorTestConstants.NON_TRIVIAL_DATA_VALIDATION_TEST_HG19_DATA_SET_2,
// b37Reference,
// FuncotatorTestConstants.REFERENCE_VERSION_HG19,
// FuncotatorTestConstants.FUNCOTATOR_DATA_SOURCES_MAIN_FOLDER
// },
// {
// FuncotatorTestConstants.NON_TRIVIAL_DATA_VALIDATION_TEST_HG38,
// hg38Reference,
// FuncotatorTestConstants.REFERENCE_VERSION_HG38,
// FuncotatorTestConstants.FUNCOTATOR_DATA_SOURCES_MAIN_FOLDER
// },
// {
// FuncotatorTestConstants.NON_TRIVIAL_DATA_VALIDATION_TEST_HG19_LARGE_DATA_SET,
// b37Reference,
// FuncotatorTestConstants.REFERENCE_VERSION_HG19,
// FuncotatorTestConstants.FUNCOTATOR_DATA_SOURCES_MAIN_FOLDER
// },
};
}

Expand Down

0 comments on commit 1711e2c

Please sign in to comment.