From 4b8dad237f154ac2762e8ec8e1a0cebf5352cf8a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jacobo=20Coll=20Morag=C3=B3n?= Date: Tue, 8 Oct 2024 14:16:06 +0100 Subject: [PATCH 01/66] storage: Add variant-walker tool #TASK-6722 --- .../analysis/variant/VariantExportTool.java | 4 + .../analysis/variant/VariantWalkerTool.java | 94 +++++ .../manager/VariantStorageManager.java | 38 ++ .../VariantExportOperationManager.java | 21 +- .../app/cli/main/OpenCgaCompleter.java | 2 +- .../app/cli/main/OpencgaCliOptionsParser.java | 1 + .../AnalysisVariantCommandExecutor.java | 130 ++++++ .../AnalysisVariantCommandOptions.java | 304 ++++++++++++++ opencga-client/src/main/R/R/Variant-methods.R | 18 + .../client/rest/clients/VariantClient.java | 26 ++ opencga-client/src/main/javascript/Variant.js | 22 + .../pyopencga/rest_clients/variant_client.py | 31 ++ .../core/cellbase/CellBaseValidator.java | 27 +- .../core/config/ConfigurationOption.java | 4 + .../models/variant/VariantWalkerParams.java | 75 ++++ .../rest/analysis/AnalysisWebService.java | 5 - .../rest/analysis/VariantWebService.java | 22 + .../core/variant/VariantStorageEngine.java | 45 +- .../core/variant/VariantStorageOptions.java | 22 + .../core/variant/io/VariantWriterFactory.java | 8 + .../dummy/DummyVariantStorageEngine.java | 8 + .../hadoop/utils/AbstractHBaseDriver.java | 27 +- .../utils/ValueOnlyTextOutputFormat.java | 33 ++ .../variant/HadoopVariantStorageEngine.java | 19 + .../hadoop/variant/io/VariantDriver.java | 177 ++++++++ .../variant/io/VariantExporterDriver.java | 167 ++------ .../variant/mr/StreamVariantDriver.java | 140 +++++++ .../variant/mr/StreamVariantMapper.java | 394 ++++++++++++++++++ .../src/main/python/variant_walker.py | 157 +++++++ .../src/main/python/walker_example.py | 51 +++ .../VariantHadoopStoragePipelineTest.java | 49 +++ .../variant/io/HadoopVariantExporterTest.java | 2 +- .../src/test/resources/gaps/file1.genome.vcf | 1 + .../src/test/resources/gaps/file2.genome.vcf | 1 + 34 files changed, 1950 insertions(+), 175 deletions(-) create mode 100644 opencga-analysis/src/main/java/org/opencb/opencga/analysis/variant/VariantWalkerTool.java create mode 100644 opencga-core/src/main/java/org/opencb/opencga/core/models/variant/VariantWalkerParams.java create mode 100644 opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/utils/ValueOnlyTextOutputFormat.java create mode 100644 opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/io/VariantDriver.java create mode 100644 opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/mr/StreamVariantDriver.java create mode 100644 opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/mr/StreamVariantMapper.java create mode 100644 opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/python/variant_walker.py create mode 100644 opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/python/walker_example.py diff --git a/opencga-analysis/src/main/java/org/opencb/opencga/analysis/variant/VariantExportTool.java b/opencga-analysis/src/main/java/org/opencb/opencga/analysis/variant/VariantExportTool.java index ea6bb73998a..b0a2005ac18 100644 --- a/opencga-analysis/src/main/java/org/opencb/opencga/analysis/variant/VariantExportTool.java +++ 
b/opencga-analysis/src/main/java/org/opencb/opencga/analysis/variant/VariantExportTool.java @@ -72,6 +72,9 @@ protected List<String> getSteps() { protected void run() throws Exception { List<URI> uris = new ArrayList<>(2); step(ID, () -> { + // Use scratch directory to store intermediate files. Move files to final directory at the end + // The scratch directory is expected to be faster than the final directory + // This also avoids moving files to final directory if the tool fails Path outDir = getScratchDir(); String outputFile = StringUtils.isEmpty(toolParams.getOutputFileName()) ? outDir.toString() @@ -86,6 +89,7 @@ protected void run() throws Exception { toolParams.getVariantsFile(), query, queryOptions, token)); }); step("move-files", () -> { + // Move files to final directory IOManager ioManager = catalogManager.getIoManagerFactory().get(uris.get(0)); for (URI uri : uris) { String fileName = UriUtils.fileName(uri);
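Both VariantExportTool above and the new VariantWalkerTool below share the same two-step shape: write into the tool's scratch directory first, then move the finished files into the final output directory. A minimal sketch of that pattern with plain java.nio, where the temp directories stand in for OpenCgaTool's getScratchDir() and getOutDir():

```java
import java.io.IOException;
import java.nio.file.*;

public class ScratchThenMove {
    public static void main(String[] args) throws IOException {
        // Stand-ins for OpenCgaTool#getScratchDir() and #getOutDir()
        Path scratchDir = Files.createTempDirectory("opencga-scratch");
        Path outDir = Files.createTempDirectory("opencga-out");

        // Step 1: write intermediate results into the (fast, local) scratch directory.
        // If this step fails, the final output directory is never touched.
        Path intermediate = scratchDir.resolve("variants.vcf.gz");
        Files.write(intermediate, new byte[0]);

        // Step 2: only after the work succeeded, move the finished file.
        // A move is cheap compared to re-writing the data.
        Files.move(intermediate, outDir.resolve(intermediate.getFileName()),
                StandardCopyOption.REPLACE_EXISTING);
    }
}
```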
diff --git a/opencga-analysis/src/main/java/org/opencb/opencga/analysis/variant/VariantWalkerTool.java b/opencga-analysis/src/main/java/org/opencb/opencga/analysis/variant/VariantWalkerTool.java new file mode 100644 index 00000000000..3e826de405b --- /dev/null +++ b/opencga-analysis/src/main/java/org/opencb/opencga/analysis/variant/VariantWalkerTool.java @@ -0,0 +1,94 @@ +/* + * Copyright 2015-2020 OpenCB + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.opencb.opencga.analysis.variant; + +import org.apache.commons.lang3.StringUtils; +import org.opencb.commons.datastore.core.Query; +import org.opencb.commons.datastore.core.QueryOptions; +import org.opencb.opencga.analysis.tools.OpenCgaTool; +import org.opencb.opencga.catalog.io.IOManager; +import org.opencb.opencga.core.common.UriUtils; +import org.opencb.opencga.core.models.common.Enums; +import org.opencb.opencga.core.models.variant.VariantWalkerParams; +import org.opencb.opencga.core.tools.annotations.Tool; +import org.opencb.opencga.core.tools.annotations.ToolParams; +import org.opencb.opencga.storage.core.variant.adaptors.VariantQueryParam; +import org.opencb.opencga.storage.core.variant.io.VariantWriterFactory; + +import java.net.URI; +import java.nio.file.Path; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; + +@Tool(id = VariantWalkerTool.ID, description = VariantWalkerTool.DESCRIPTION, + scope = Tool.Scope.PROJECT, resource = Enums.Resource.VARIANT) +public class VariantWalkerTool extends OpenCgaTool { + public static final String ID = "variant-walk"; + public static final String DESCRIPTION = "Filter and walk variants from the variant storage to produce a file"; + + @ToolParams + protected VariantWalkerParams toolParams = new VariantWalkerParams(); + + private VariantWriterFactory.VariantOutputFormat format; + + @Override + protected void check() throws Exception { + super.check(); + + if (StringUtils.isEmpty(toolParams.getFileFormat())) { + toolParams.setFileFormat(VariantWriterFactory.VariantOutputFormat.VCF.toString()); + } + + format = VariantWriterFactory.toOutputFormat(toolParams.getFileFormat(), toolParams.getOutputFileName()); + } + + @Override + protected List<String> getSteps() { + return Arrays.asList(ID, "move-files"); + } + + @Override + protected void run() throws Exception { + List<URI> uris = new ArrayList<>(2); + step(ID, () -> { + // Use scratch directory to store intermediate files. Move files to final directory at the end + // The scratch directory is expected to be faster than the final directory + // This also avoids moving files to final directory if the tool fails + Path outDir = getScratchDir(); + String outputFile = StringUtils.isEmpty(toolParams.getOutputFileName()) + ? outDir.toString() + : outDir.resolve(toolParams.getOutputFileName()).toString(); + Query query = toolParams.toQuery(); + QueryOptions queryOptions = new QueryOptions(params); + for (VariantQueryParam param : VariantQueryParam.values()) { + queryOptions.remove(param.key()); + } + uris.addAll(variantStorageManager.walkData(outputFile, + format, query, queryOptions, toolParams.getDockerImage(), toolParams.getCommandLine(), token)); + }); + step("move-files", () -> { + // Move files to final directory + IOManager ioManager = catalogManager.getIoManagerFactory().get(uris.get(0)); + for (URI uri : uris) { + String fileName = UriUtils.fileName(uri); + logger.info("Moving file -- " + fileName); + ioManager.move(uri, getOutDir().resolve(fileName).toUri()); + } + }); + } +}
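The fields consumed by check() and run() above come from VariantWalkerParams, defined later in this patch. A hypothetical parametrisation, only to illustrate how the pieces fit together; the docker image and command line values are invented:

```java
import org.opencb.opencga.core.models.variant.VariantWalkerParams;

public class WalkerParamsExample {
    public static void main(String[] args) {
        VariantWalkerParams params = new VariantWalkerParams()
                .setDockerImage("ubuntu:24.04")               // hypothetical image
                .setCommandLine("grep -v '^#' | cut -f 1-5"); // hypothetical walker command
        params.setRegion("22");                  // inherited VariantQueryParams filter
        params.setFileFormat("VCF");             // check() defaults this to VCF when empty
        params.setOutputFileName("chr22.walked.txt.gz");
        // run() splits this object into a variant Query plus the walker-specific fields
        System.out.println(params.toQuery());
    }
}
```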
diff --git a/opencga-analysis/src/main/java/org/opencb/opencga/analysis/variant/manager/VariantStorageManager.java index bd71414a4e8..d1e276fbf34 100644 --- a/opencga-analysis/src/main/java/org/opencb/opencga/analysis/variant/manager/VariantStorageManager.java +++ b/opencga-analysis/src/main/java/org/opencb/opencga/analysis/variant/manager/VariantStorageManager.java @@ -37,6 +37,7 @@ import org.opencb.commons.datastore.solr.SolrManager; import org.opencb.opencga.analysis.StorageManager; import org.opencb.opencga.analysis.variant.VariantExportTool; +import org.opencb.opencga.analysis.variant.VariantWalkerTool; import org.opencb.opencga.analysis.variant.manager.operations.*; import org.opencb.opencga.analysis.variant.metadata.CatalogStorageMetadataSynchronizer; import org.opencb.opencga.analysis.variant.metadata.CatalogVariantMetadataFactory; @@ -187,6 +188,32 @@ public List<URI> exportData(String outputFile, VariantOutputFormat outputFormat, }); } + /** + * Walks the variants matching the given query through an external command, writing the command output to a file. + * + * @param outputFile Optional output file. If null or empty, will print into the Standard output. + * @param format Variant Output format. + * @param query Query with the variants to walk + * @param queryOptions Query options + * @param dockerImage Docker image to use + * @param commandLine Command line to use + * @param token User's session id + * @throws CatalogException if there is any error with Catalog + * @throws StorageEngineException If there is any error walking variants + * @return generated files + */ + public List<URI> walkData(String outputFile, VariantOutputFormat format, + Query query, QueryOptions queryOptions, String dockerImage, String commandLine, String token) + throws CatalogException, StorageEngineException { + String anyStudy = catalogUtils.getAnyStudy(query, token); + return secureAnalysis(VariantWalkerTool.ID, anyStudy, queryOptions, token, engine -> { + Query finalQuery = catalogUtils.parseQuery(query, queryOptions, engine.getCellBaseUtils(), token); + checkSamplesPermissions(finalQuery, queryOptions, token); + URI outputUri = new VariantExportOperationManager(this, engine).getOutputUri(outputFile, format, finalQuery, token); + return engine.walkData(outputUri, format, finalQuery, queryOptions, dockerImage, commandLine); + }); + }
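A sketch of driving this entry point directly, mirroring what VariantWalkerTool.run() does after stripping the query keys from the options; the study, docker image and command line values are hypothetical:

```java
import java.net.URI;
import java.util.List;
import org.opencb.commons.datastore.core.Query;
import org.opencb.commons.datastore.core.QueryOptions;
import org.opencb.opencga.analysis.variant.manager.VariantStorageManager;
import org.opencb.opencga.storage.core.variant.adaptors.VariantQueryParam;
import org.opencb.opencga.storage.core.variant.io.VariantWriterFactory;

public class WalkDataExample {
    // Assumes a configured VariantStorageManager and a valid session token.
    static List<URI> walk(VariantStorageManager manager, String token) throws Exception {
        Query query = new Query(VariantQueryParam.STUDY.key(), "demo@project:study"); // hypothetical study
        query.put(VariantQueryParam.REGION.key(), "22");
        return manager.walkData(
                "walked.txt.gz",                              // output file name
                VariantWriterFactory.VariantOutputFormat.VCF, // format fed to the walker's stdin
                query, new QueryOptions(),
                "ubuntu:24.04",                               // hypothetical docker image
                "grep -v '^#' | wc -l",                       // hypothetical command line
                token);
    }
}
```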
+ // --------------------------// // Data Operation methods // // --------------------------// @@ -506,6 +533,8 @@ public boolean hasVariantSetup(String studyStr, String token) throws CatalogExce public ObjectMap configureProject(String projectStr, ObjectMap params, String token) throws CatalogException, StorageEngineException { return secureOperationByProject("configure", projectStr, params, token, engine -> { + validateNewConfiguration(engine, params); + DataStore dataStore = getDataStoreByProjectId(projectStr, token); dataStore.getOptions().putAll(params); @@ -517,6 +546,7 @@ public ObjectMap configureProject(String projectStr, ObjectMap params, String to public ObjectMap configureStudy(String studyStr, ObjectMap params, String token) throws CatalogException, StorageEngineException { return secureOperation("configure", studyStr, params, token, engine -> { + validateNewConfiguration(engine, params); Study study = catalogManager.getStudyManager() .get(studyStr, new QueryOptions(INCLUDE, StudyDBAdaptor.QueryParams.INTERNAL_CONFIGURATION_VARIANT_ENGINE_OPTIONS.key()), @@ -540,6 +570,14 @@ public ObjectMap configureStudy(String studyStr, ObjectMap params, String token) }); } + private void validateNewConfiguration(VariantStorageEngine engine, ObjectMap params) throws StorageEngineException { + for (VariantStorageOptions option : VariantStorageOptions.values()) { + if (option.isProtected() && params.get(option.key()) != null) { + throw new StorageEngineException("Unable to update protected option '" + option.key() + "'"); + } + } + } + /** * Modify SampleIndex configuration. Automatically submit a job to rebuild the sample index. *
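configureProject() and configureStudy() now funnel updates through validateNewConfiguration(), which rejects any key whose ConfigurationOption.isProtected() returns true (the hook is added to ConfigurationOption further down in this patch). A minimal sketch of the guard's behaviour, using a hypothetical option enum in place of VariantStorageOptions:

```java
import org.opencb.commons.datastore.core.ObjectMap;

public class ProtectedOptionGuard {
    // Hypothetical options mirroring ConfigurationOption#key() and #isProtected()
    enum Option {
        WALKER_DOCKER_HOST("walker.docker.host", true),   // protected: users cannot override
        EXPORT_BATCH_SIZE("export.batch.size", false);    // regular, user-configurable option

        private final String key;
        private final boolean isProtected;

        Option(String key, boolean isProtected) {
            this.key = key;
            this.isProtected = isProtected;
        }

        String key() { return key; }
        boolean isProtected() { return isProtected; }
    }

    // Same shape as validateNewConfiguration(): reject updates to protected keys.
    static void validate(ObjectMap params) {
        for (Option option : Option.values()) {
            if (option.isProtected() && params.get(option.key()) != null) {
                throw new IllegalArgumentException("Unable to update protected option '" + option.key() + "'");
            }
        }
    }

    public static void main(String[] args) {
        validate(new ObjectMap("export.batch.size", 1000));      // passes
        validate(new ObjectMap("walker.docker.host", "tcp://")); // throws
    }
}
```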
diff --git a/opencga-analysis/src/main/java/org/opencb/opencga/analysis/variant/manager/operations/VariantExportOperationManager.java index 54ca3d11113..880d0232a82 100644 --- a/opencga-analysis/src/main/java/org/opencb/opencga/analysis/variant/manager/operations/VariantExportOperationManager.java +++ b/opencga-analysis/src/main/java/org/opencb/opencga/analysis/variant/manager/operations/VariantExportOperationManager.java @@ -30,6 +30,7 @@ import org.opencb.opencga.storage.core.variant.adaptors.VariantQueryParam; import org.opencb.opencga.storage.core.variant.io.VariantWriterFactory; +import java.io.IOException; import java.net.URI; import java.net.URISyntaxException; import java.nio.file.Paths; @@ -48,6 +49,17 @@ public VariantExportOperationManager(VariantStorageManager variantStorageManager public List<URI> export(String outputFileStr, VariantWriterFactory.VariantOutputFormat outputFormat, String variantsFile, Query query, QueryOptions queryOptions, String token) throws Exception { + URI outputFile = getOutputUri(outputFileStr, outputFormat, query, token); + + VariantMetadataFactory metadataExporter = + new CatalogVariantMetadataFactory(catalogManager, variantStorageEngine.getDBAdaptor(), token); + + URI variantsFileUri = StringUtils.isEmpty(variantsFile) ? null : UriUtils.createUri(variantsFile); + return variantStorageEngine.exportData(outputFile, outputFormat, variantsFileUri, query, queryOptions, metadataExporter); + } + + public URI getOutputUri(String outputFileStr, VariantWriterFactory.VariantOutputFormat format, Query query, String token) + throws CatalogException, IOException { URI outputFile; if (!VariantWriterFactory.isStandardOutput(outputFileStr)) { URI outdirUri; @@ -71,19 +83,14 @@ public List<URI> export(String outputFileStr, VariantWriterFactory.VariantOutput outputFileName = buildOutputFileName(query, token); } outputFile = outdirUri.resolve(outputFileName); - outputFile = VariantWriterFactory.checkOutput(outputFile, outputFormat); + outputFile = VariantWriterFactory.checkOutput(outputFile, format); } else { outputFile = outdirUri; } } else { outputFile = null; } - - VariantMetadataFactory metadataExporter = - new CatalogVariantMetadataFactory(catalogManager, variantStorageEngine.getDBAdaptor(), token); - - URI variantsFileUri = StringUtils.isEmpty(variantsFile) ? null : UriUtils.createUri(variantsFile); - return variantStorageEngine.exportData(outputFile, outputFormat, variantsFileUri, query, queryOptions, metadataExporter); + return outputFile; } private String buildOutputFileName(Query query, String token) throws CatalogException {
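export() now delegates output-path resolution to the reusable getOutputUri(), which walkData() above also calls, so both tools resolve "directory vs explicit file name vs standard output" identically. A simplified standalone mirror of that branching (catalog lookups and the exact stdout token are elided; this is a sketch, not the real helper):

```java
import java.net.URI;

public class OutputUriSketch {
    // null result means "write to standard output" (and skip metadata export).
    static URI resolve(String output, String generatedFileName) {
        if (output == null || output.isEmpty()) {
            return null;                           // standard output
        }
        URI uri = URI.create("file:///data/" + output); // assumed absolute base, for the sketch only
        if (output.endsWith("/")) {
            return uri.resolve(generatedFileName); // directory: derive a file name from the query
        }
        return uri;                                // explicit file name, used as-is
    }

    public static void main(String[] args) {
        System.out.println(resolve("out/", "export.vcf.gz"));   // file:///data/out/export.vcf.gz
        System.out.println(resolve("out/custom.vcf.gz", null)); // file:///data/out/custom.vcf.gz
        System.out.println(resolve("", null));                  // null -> stdout
    }
}
```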
diff --git a/opencga-app/src/main/java/org/opencb/opencga/app/cli/main/OpenCgaCompleter.java index fa5b3482284..6197b049393 100644 --- a/opencga-app/src/main/java/org/opencb/opencga/app/cli/main/OpenCgaCompleter.java +++ b/opencga-app/src/main/java/org/opencb/opencga/app/cli/main/OpenCgaCompleter.java @@ -29,7 +29,7 @@ public abstract class OpenCgaCompleter implements Completer { .map(Candidate::new) .collect(toList()); - private List<Candidate> variantList = asList( "aggregationstats","annotation-metadata","annotation-query","circos-run","cohort-stats-delete","cohort-stats-info","cohort-stats-run","exomiser-run","export-run","family-genotypes","family-qc-run","file-delete","gatk-run","genome-plot-run","gwas-run","hr-detect-run","index-run","individual-qc-run","inferred-sex-run","knockout-gene-query","knockout-individual-query","knockout-run","mendelian-error-run","metadata","mutational-signature-query","mutational-signature-run","plink-run","query","relatedness-run","rvtests-run","sample-aggregation-stats","sample-eligibility-run","sample-qc-run","sample-query","sample-run","sample-stats-query","sample-stats-run","stats-export-run","stats-run") + private List<Candidate> variantList = asList( "aggregationstats","annotation-metadata","annotation-query","circos-run","cohort-stats-delete","cohort-stats-info","cohort-stats-run","exomiser-run","export-run","family-genotypes","family-qc-run","file-delete","gatk-run","genome-plot-run","gwas-run","hr-detect-run","index-run","individual-qc-run","inferred-sex-run","knockout-gene-query","knockout-individual-query","knockout-run","mendelian-error-run","metadata","mutational-signature-query","mutational-signature-run","plink-run","query","relatedness-run","rvtests-run","sample-aggregation-stats","sample-eligibility-run","sample-qc-run","sample-query","sample-run","sample-stats-query","sample-stats-run","stats-export-run","stats-run","walker-run") .stream() .map(Candidate::new) .collect(toList()); diff --git a/opencga-app/src/main/java/org/opencb/opencga/app/cli/main/OpencgaCliOptionsParser.java index cfd4cee9919..eb95cf2fd17 100644 --- a/opencga-app/src/main/java/org/opencb/opencga/app/cli/main/OpencgaCliOptionsParser.java +++ b/opencga-app/src/main/java/org/opencb/opencga/app/cli/main/OpencgaCliOptionsParser.java @@ -84,6 +84,7 @@ public OpencgaCliOptionsParser() { analysisVariantSubCommands.addCommand("sample-stats-run", analysisVariantCommandOptions.runSampleStatsCommandOptions); analysisVariantSubCommands.addCommand("stats-export-run", analysisVariantCommandOptions.runStatsExportCommandOptions); analysisVariantSubCommands.addCommand("stats-run", analysisVariantCommandOptions.runStatsCommandOptions); + analysisVariantSubCommands.addCommand("walker-run", analysisVariantCommandOptions.runWalkerCommandOptions); projectsCommandOptions = new ProjectsCommandOptions(commonCommandOptions, jCommander); jCommander.addCommand("projects", projectsCommandOptions);
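In the executor below, runWalker() flattens the CLI flags into an ObjectMap of bean fields and round-trips it through JSON into VariantWalkerParams, with FAIL_ON_UNKNOWN_PROPERTIES enabled so a key that matches no field fails fast instead of being silently dropped. The same pattern in miniature with plain Jackson (the map keys are real VariantWalkerParams fields from this patch; the values are invented):

```java
import com.fasterxml.jackson.databind.DeserializationFeature;
import com.fasterxml.jackson.databind.ObjectMapper;
import org.opencb.opencga.core.models.variant.VariantWalkerParams;

import java.util.HashMap;
import java.util.Map;

public class BeanParamsRoundTrip {
    public static void main(String[] args) {
        Map<String, Object> beanParams = new HashMap<>();
        beanParams.put("fileFormat", "VCF");
        beanParams.put("dockerImage", "ubuntu:24.04");  // hypothetical image
        beanParams.put("commandLine", "grep -v '^#'");  // hypothetical command

        ObjectMapper mapper = new ObjectMapper()
                // Reject keys that do not exist in the target bean: a mistyped
                // flag-to-field mapping becomes an immediate error, not silent loss.
                .configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, true);

        VariantWalkerParams params = mapper.convertValue(beanParams, VariantWalkerParams.class);
        System.out.println(params.getDockerImage());
    }
}
```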
diff --git a/opencga-app/src/main/java/org/opencb/opencga/app/cli/main/executors/AnalysisVariantCommandExecutor.java b/opencga-app/src/main/java/org/opencb/opencga/app/cli/main/executors/AnalysisVariantCommandExecutor.java index 57edea737fa..3e3d5cd7528 100644 --- a/opencga-app/src/main/java/org/opencb/opencga/app/cli/main/executors/AnalysisVariantCommandExecutor.java +++ b/opencga-app/src/main/java/org/opencb/opencga/app/cli/main/executors/AnalysisVariantCommandExecutor.java @@ -52,6 +52,7 @@ import org.opencb.opencga.core.models.variant.SampleVariantStatsAnalysisParams; import org.opencb.opencga.core.models.variant.VariantExportParams; import org.opencb.opencga.core.models.variant.VariantStatsAnalysisParams; +import org.opencb.opencga.core.models.variant.VariantWalkerParams; import org.opencb.opencga.core.response.QueryType; import org.opencb.opencga.core.response.RestResponse; import org.opencb.oskar.analysis.variant.gwas.GwasConfiguration; @@ -207,6 +208,9 @@ public void execute() throws Exception { case "stats-run": queryResponse = runStats(); break; + case "walker-run": + queryResponse = runWalker(); + break; default: logger.error("Subcommand not valid"); break; @@ -1874,4 +1878,130 @@ private RestResponse<Job> runStats() throws Exception { } return openCGAClient.getVariantClient().runStats(variantStatsAnalysisParams, queryParams); } + + private RestResponse<Job> runWalker() throws Exception { + logger.debug("Executing runWalker in Analysis - Variant command line"); + + AnalysisVariantCommandOptions.RunWalkerCommandOptions commandOptions = analysisVariantCommandOptions.runWalkerCommandOptions; + + ObjectMap queryParams = new ObjectMap(); + queryParams.putIfNotEmpty("include", commandOptions.include); + queryParams.putIfNotEmpty("exclude", commandOptions.exclude); + queryParams.putIfNotEmpty("project", commandOptions.project); + queryParams.putIfNotEmpty("study", commandOptions.study); + queryParams.putIfNotEmpty("jobId", commandOptions.jobId); + queryParams.putIfNotEmpty("jobDescription", commandOptions.jobDescription); + queryParams.putIfNotEmpty("jobDependsOn", commandOptions.jobDependsOn); + queryParams.putIfNotEmpty("jobTags", commandOptions.jobTags); + queryParams.putIfNotEmpty("jobScheduledStartTime", commandOptions.jobScheduledStartTime); + queryParams.putIfNotEmpty("jobPriority", commandOptions.jobPriority); + queryParams.putIfNotNull("jobDryRun", commandOptions.jobDryRun); + if (queryParams.get("study") == null && OpencgaMain.isShellMode()) { + queryParams.putIfNotEmpty("study", sessionManager.getSession().getCurrentStudy()); + } + + + VariantWalkerParams variantWalkerParams = null; + if (commandOptions.jsonDataModel) { + RestResponse<Job> res = new RestResponse<>(); + res.setType(QueryType.VOID); + PrintUtils.println(getObjectAsJSON(categoryName,"/{apiVersion}/analysis/variant/walker/run")); + return res; + } else if (commandOptions.jsonFile != null) { + variantWalkerParams = JacksonUtils.getDefaultObjectMapper() + .readValue(new java.io.File(commandOptions.jsonFile), VariantWalkerParams.class); + } else { + ObjectMap beanParams = new ObjectMap(); + putNestedIfNotEmpty(beanParams, "id", commandOptions.id, true); + putNestedIfNotEmpty(beanParams, "region", commandOptions.region, true); + putNestedIfNotEmpty(beanParams, "gene", commandOptions.gene, true); + putNestedIfNotEmpty(beanParams, "type", commandOptions.type, true); + putNestedIfNotEmpty(beanParams, "panel", commandOptions.panel, true); + putNestedIfNotEmpty(beanParams, "panelModeOfInheritance", commandOptions.panelModeOfInheritance, true); + putNestedIfNotEmpty(beanParams, "panelConfidence", commandOptions.panelConfidence, true); +
putNestedIfNotEmpty(beanParams, "panelRoleInCancer", commandOptions.panelRoleInCancer, true); + putNestedIfNotNull(beanParams, "panelIntersection", commandOptions.panelIntersection, true); + putNestedIfNotEmpty(beanParams, "panelFeatureType", commandOptions.panelFeatureType, true); + putNestedIfNotEmpty(beanParams, "cohortStatsRef", commandOptions.cohortStatsRef, true); + putNestedIfNotEmpty(beanParams, "cohortStatsAlt", commandOptions.cohortStatsAlt, true); + putNestedIfNotEmpty(beanParams, "cohortStatsMaf", commandOptions.cohortStatsMaf, true); + putNestedIfNotEmpty(beanParams, "ct", commandOptions.ct, true); + putNestedIfNotEmpty(beanParams, "xref", commandOptions.xref, true); + putNestedIfNotEmpty(beanParams, "biotype", commandOptions.biotype, true); + putNestedIfNotEmpty(beanParams, "proteinSubstitution", commandOptions.proteinSubstitution, true); + putNestedIfNotEmpty(beanParams, "conservation", commandOptions.conservation, true); + putNestedIfNotEmpty(beanParams, "populationFrequencyMaf", commandOptions.populationFrequencyMaf, true); + putNestedIfNotEmpty(beanParams, "populationFrequencyAlt", commandOptions.populationFrequencyAlt, true); + putNestedIfNotEmpty(beanParams, "populationFrequencyRef", commandOptions.populationFrequencyRef, true); + putNestedIfNotEmpty(beanParams, "transcriptFlag", commandOptions.transcriptFlag, true); + putNestedIfNotEmpty(beanParams, "functionalScore", commandOptions.functionalScore, true); + putNestedIfNotEmpty(beanParams, "clinical", commandOptions.clinical, true); + putNestedIfNotEmpty(beanParams, "clinicalSignificance", commandOptions.clinicalSignificance, true); + putNestedIfNotNull(beanParams, "clinicalConfirmedStatus", commandOptions.clinicalConfirmedStatus, true); + putNestedIfNotEmpty(beanParams, "project", commandOptions.bodyProject, true); + putNestedIfNotEmpty(beanParams, "study", commandOptions.bodyStudy, true); + putNestedIfNotEmpty(beanParams, "savedFilter", commandOptions.savedFilter, true); + putNestedIfNotEmpty(beanParams, "chromosome", commandOptions.chromosome, true); + putNestedIfNotEmpty(beanParams, "reference", commandOptions.reference, true); + putNestedIfNotEmpty(beanParams, "alternate", commandOptions.alternate, true); + putNestedIfNotEmpty(beanParams, "release", commandOptions.release, true); + putNestedIfNotEmpty(beanParams, "includeStudy", commandOptions.includeStudy, true); + putNestedIfNotEmpty(beanParams, "includeSample", commandOptions.includeSample, true); + putNestedIfNotEmpty(beanParams, "includeFile", commandOptions.includeFile, true); + putNestedIfNotEmpty(beanParams, "includeSampleData", commandOptions.includeSampleData, true); + putNestedIfNotNull(beanParams, "includeSampleId", commandOptions.includeSampleId, true); + putNestedIfNotNull(beanParams, "includeGenotype", commandOptions.includeGenotype, true); + putNestedIfNotEmpty(beanParams, "file", commandOptions.file, true); + putNestedIfNotEmpty(beanParams, "qual", commandOptions.qual, true); + putNestedIfNotEmpty(beanParams, "filter", commandOptions.filter, true); + putNestedIfNotEmpty(beanParams, "fileData", commandOptions.fileData, true); + putNestedIfNotEmpty(beanParams, "genotype", commandOptions.genotype, true); + putNestedIfNotEmpty(beanParams, "sample", commandOptions.sample, true); + putNestedIfNotNull(beanParams, "sampleLimit", commandOptions.sampleLimit, true); + putNestedIfNotNull(beanParams, "sampleSkip", commandOptions.sampleSkip, true); + putNestedIfNotEmpty(beanParams, "sampleData", commandOptions.sampleData, true); + 
putNestedIfNotEmpty(beanParams, "sampleAnnotation", commandOptions.sampleAnnotation, true); + putNestedIfNotEmpty(beanParams, "family", commandOptions.family, true); + putNestedIfNotEmpty(beanParams, "familyMembers", commandOptions.familyMembers, true); + putNestedIfNotEmpty(beanParams, "familyDisorder", commandOptions.familyDisorder, true); + putNestedIfNotEmpty(beanParams, "familyProband", commandOptions.familyProband, true); + putNestedIfNotEmpty(beanParams, "familySegregation", commandOptions.familySegregation, true); + putNestedIfNotEmpty(beanParams, "cohort", commandOptions.cohort, true); + putNestedIfNotEmpty(beanParams, "cohortStatsPass", commandOptions.cohortStatsPass, true); + putNestedIfNotEmpty(beanParams, "cohortStatsMgf", commandOptions.cohortStatsMgf, true); + putNestedIfNotEmpty(beanParams, "missingAlleles", commandOptions.missingAlleles, true); + putNestedIfNotEmpty(beanParams, "missingGenotypes", commandOptions.missingGenotypes, true); + putNestedIfNotNull(beanParams, "annotationExists", commandOptions.annotationExists, true); + putNestedIfNotEmpty(beanParams, "score", commandOptions.score, true); + putNestedIfNotEmpty(beanParams, "polyphen", commandOptions.polyphen, true); + putNestedIfNotEmpty(beanParams, "sift", commandOptions.sift, true); + putNestedIfNotEmpty(beanParams, "geneRoleInCancer", commandOptions.geneRoleInCancer, true); + putNestedIfNotEmpty(beanParams, "geneTraitId", commandOptions.geneTraitId, true); + putNestedIfNotEmpty(beanParams, "geneTraitName", commandOptions.geneTraitName, true); + putNestedIfNotEmpty(beanParams, "trait", commandOptions.trait, true); + putNestedIfNotEmpty(beanParams, "cosmic", commandOptions.cosmic, true); + putNestedIfNotEmpty(beanParams, "clinvar", commandOptions.clinvar, true); + putNestedIfNotEmpty(beanParams, "hpo", commandOptions.hpo, true); + putNestedIfNotEmpty(beanParams, "go", commandOptions.go, true); + putNestedIfNotEmpty(beanParams, "expression", commandOptions.expression, true); + putNestedIfNotEmpty(beanParams, "proteinKeyword", commandOptions.proteinKeyword, true); + putNestedIfNotEmpty(beanParams, "drug", commandOptions.drug, true); + putNestedIfNotEmpty(beanParams, "customAnnotation", commandOptions.customAnnotation, true); + putNestedIfNotEmpty(beanParams, "unknownGenotype", commandOptions.unknownGenotype, true); + putNestedIfNotNull(beanParams, "sampleMetadata", commandOptions.sampleMetadata, true); + putNestedIfNotNull(beanParams, "sort", commandOptions.sort, true); + putNestedIfNotEmpty(beanParams, "outdir", commandOptions.outdir, true); + putNestedIfNotEmpty(beanParams, "outputFileName", commandOptions.outputFileName, true); + putNestedIfNotEmpty(beanParams, "fileFormat", commandOptions.fileFormat, true); + putNestedIfNotEmpty(beanParams, "dockerImage", commandOptions.dockerImage, true); + putNestedIfNotEmpty(beanParams, "commandLine", commandOptions.commandLine, true); + putNestedIfNotEmpty(beanParams, "include", commandOptions.bodyInclude, true); + putNestedIfNotEmpty(beanParams, "exclude", commandOptions.bodyExclude, true); + + variantWalkerParams = JacksonUtils.getDefaultObjectMapper().copy() + .configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, true) + .readValue(beanParams.toJson(), VariantWalkerParams.class); + } + return openCGAClient.getVariantClient().runWalker(variantWalkerParams, queryParams); + } } \ No newline at end of file diff --git a/opencga-app/src/main/java/org/opencb/opencga/app/cli/main/options/AnalysisVariantCommandOptions.java 
b/opencga-app/src/main/java/org/opencb/opencga/app/cli/main/options/AnalysisVariantCommandOptions.java index 44a0ad64dd7..998a7dc510b 100644 --- a/opencga-app/src/main/java/org/opencb/opencga/app/cli/main/options/AnalysisVariantCommandOptions.java +++ b/opencga-app/src/main/java/org/opencb/opencga/app/cli/main/options/AnalysisVariantCommandOptions.java @@ -72,6 +72,7 @@ public class AnalysisVariantCommandOptions { public RunSampleStatsCommandOptions runSampleStatsCommandOptions; public RunStatsExportCommandOptions runStatsExportCommandOptions; public RunStatsCommandOptions runStatsCommandOptions; + public RunWalkerCommandOptions runWalkerCommandOptions; public AnalysisVariantCommandOptions(CommonCommandOptions commonCommandOptions, JCommander jCommander) { @@ -117,6 +118,7 @@ public AnalysisVariantCommandOptions(CommonCommandOptions commonCommandOptions, this.runSampleStatsCommandOptions = new RunSampleStatsCommandOptions(); this.runStatsExportCommandOptions = new RunStatsExportCommandOptions(); this.runStatsCommandOptions = new RunStatsCommandOptions(); + this.runWalkerCommandOptions = new RunWalkerCommandOptions(); } @@ -2832,4 +2834,306 @@ public class RunStatsCommandOptions { } + @Parameters(commandNames = {"walker-run"}, commandDescription ="Filter and walk variants from the variant storage to produce a file") + public class RunWalkerCommandOptions { + + @ParametersDelegate + public CommonCommandOptions commonOptions = commonCommandOptions; + + @Parameter(names = {"--json-file"}, description = "File with the body data in JSON format. Note, that using this parameter will ignore all the other parameters.", required = false, arity = 1) + public String jsonFile; + + @Parameter(names = {"--json-data-model"}, description = "Show example of file structure for body data.", help = true, arity = 0) + public Boolean jsonDataModel = false; + + @Parameter(names = {"--include", "-I"}, description = "Fields included in the response, whole JSON path must be provided", required = false, arity = 1) + public String include; + + @Parameter(names = {"--exclude", "-E"}, description = "Fields excluded in the response, whole JSON path must be provided", required = false, arity = 1) + public String exclude; + + @Parameter(names = {"--project", "-p"}, description = "Project [organization@]project where project can be either the ID or the alias", required = false, arity = 1) + public String project; + + @Parameter(names = {"--study", "-s"}, description = "Study [[organization@]project:]study where study and project can be either the ID or UUID", required = false, arity = 1) + public String study; + + @Parameter(names = {"--job-id"}, description = "Job ID. It must be a unique string within the study. 
An ID will be autogenerated automatically if not provided.", required = false, arity = 1) + public String jobId; + + @Parameter(names = {"--job-description"}, description = "Job description", required = false, arity = 1) + public String jobDescription; + + @Parameter(names = {"--job-depends-on"}, description = "Comma separated list of existing job IDs the job will depend on.", required = false, arity = 1) + public String jobDependsOn; + + @Parameter(names = {"--job-tags"}, description = "Job tags", required = false, arity = 1) + public String jobTags; + + @Parameter(names = {"--job-scheduled-start-time"}, description = "Time when the job is scheduled to start.", required = false, arity = 1) + public String jobScheduledStartTime; + + @Parameter(names = {"--job-priority"}, description = "Priority of the job", required = false, arity = 1) + public String jobPriority; + + @Parameter(names = {"--job-dry-run"}, description = "Flag indicating that the job will be executed in dry-run mode. In this mode, OpenCGA will validate that all parameters and prerequisites are correctly set for successful execution, but the job will not actually run.", required = false, arity = 1) + public Boolean jobDryRun; + + @Parameter(names = {"--id"}, description = "The body web service id parameter", required = false, arity = 1) + public String id; + + @Parameter(names = {"--region"}, description = "The body web service region parameter", required = false, arity = 1) + public String region; + + @Parameter(names = {"--gene"}, description = "The body web service gene parameter", required = false, arity = 1) + public String gene; + + @Parameter(names = {"--type"}, description = "The body web service type parameter", required = false, arity = 1) + public String type; + + @Parameter(names = {"--panel"}, description = "The body web service panel parameter", required = false, arity = 1) + public String panel; + + @Parameter(names = {"--panel-mode-of-inheritance"}, description = "The body web service panelModeOfInheritance parameter", required = false, arity = 1) + public String panelModeOfInheritance; + + @Parameter(names = {"--panel-confidence"}, description = "The body web service panelConfidence parameter", required = false, arity = 1) + public String panelConfidence; + + @Parameter(names = {"--panel-role-in-cancer"}, description = "The body web service panelRoleInCancer parameter", required = false, arity = 1) + public String panelRoleInCancer; + + @Parameter(names = {"--panel-intersection"}, description = "The body web service panelIntersection parameter", required = false, help = true, arity = 0) + public boolean panelIntersection = false; + + @Parameter(names = {"--panel-feature-type"}, description = "The body web service panelFeatureType parameter", required = false, arity = 1) + public String panelFeatureType; + + @Parameter(names = {"--cohort-stats-ref"}, description = "The body web service cohortStatsRef parameter", required = false, arity = 1) + public String cohortStatsRef; + + @Parameter(names = {"--cohort-stats-alt"}, description = "The body web service cohortStatsAlt parameter", required = false, arity = 1) + public String cohortStatsAlt; + + @Parameter(names = {"--cohort-stats-maf"}, description = "The body web service cohortStatsMaf parameter", required = false, arity = 1) + public String cohortStatsMaf; + + @Parameter(names = {"--ct"}, description = "The body web service ct parameter", required = false, arity = 1) + public String ct; + + @Parameter(names = {"--xref"}, description = "The body web service 
xref parameter", required = false, arity = 1) + public String xref; + + @Parameter(names = {"--biotype"}, description = "The body web service biotype parameter", required = false, arity = 1) + public String biotype; + + @Parameter(names = {"--protein-substitution"}, description = "The body web service proteinSubstitution parameter", required = false, arity = 1) + public String proteinSubstitution; + + @Parameter(names = {"--conservation"}, description = "The body web service conservation parameter", required = false, arity = 1) + public String conservation; + + @Parameter(names = {"--population-frequency-maf"}, description = "The body web service populationFrequencyMaf parameter", required = false, arity = 1) + public String populationFrequencyMaf; + + @Parameter(names = {"--population-frequency-alt"}, description = "The body web service populationFrequencyAlt parameter", required = false, arity = 1) + public String populationFrequencyAlt; + + @Parameter(names = {"--population-frequency-ref"}, description = "The body web service populationFrequencyRef parameter", required = false, arity = 1) + public String populationFrequencyRef; + + @Parameter(names = {"--transcript-flag"}, description = "The body web service transcriptFlag parameter", required = false, arity = 1) + public String transcriptFlag; + + @Parameter(names = {"--functional-score"}, description = "The body web service functionalScore parameter", required = false, arity = 1) + public String functionalScore; + + @Parameter(names = {"--clinical"}, description = "The body web service clinical parameter", required = false, arity = 1) + public String clinical; + + @Parameter(names = {"--clinical-significance"}, description = "The body web service clinicalSignificance parameter", required = false, arity = 1) + public String clinicalSignificance; + + @Parameter(names = {"--clinical-confirmed-status"}, description = "The body web service clinicalConfirmedStatus parameter", required = false, help = true, arity = 0) + public boolean clinicalConfirmedStatus = false; + + @Parameter(names = {"--body_project"}, description = "The body web service project parameter", required = false, arity = 1) + public String bodyProject; + + @Parameter(names = {"--body_study"}, description = "The body web service study parameter", required = false, arity = 1) + public String bodyStudy; + + @Parameter(names = {"--saved-filter"}, description = "The body web service savedFilter parameter", required = false, arity = 1) + public String savedFilter; + + @Parameter(names = {"--chromosome"}, description = "The body web service chromosome parameter", required = false, arity = 1) + public String chromosome; + + @Parameter(names = {"--reference"}, description = "The body web service reference parameter", required = false, arity = 1) + public String reference; + + @Parameter(names = {"--alternate"}, description = "The body web service alternate parameter", required = false, arity = 1) + public String alternate; + + @Parameter(names = {"--release"}, description = "The body web service release parameter", required = false, arity = 1) + public String release; + + @Parameter(names = {"--include-study"}, description = "The body web service includeStudy parameter", required = false, arity = 1) + public String includeStudy; + + @Parameter(names = {"--include-sample"}, description = "The body web service includeSample parameter", required = false, arity = 1) + public String includeSample; + + @Parameter(names = {"--include-file"}, description = "The body web service includeFile 
parameter", required = false, arity = 1) + public String includeFile; + + @Parameter(names = {"--include-sample-data"}, description = "The body web service includeSampleData parameter", required = false, arity = 1) + public String includeSampleData; + + @Parameter(names = {"--include-sample-id"}, description = "The body web service includeSampleId parameter", required = false, help = true, arity = 0) + public boolean includeSampleId = false; + + @Parameter(names = {"--include-genotype"}, description = "The body web service includeGenotype parameter", required = false, help = true, arity = 0) + public boolean includeGenotype = false; + + @Parameter(names = {"--file"}, description = "The body web service file parameter", required = false, arity = 1) + public String file; + + @Parameter(names = {"--qual"}, description = "The body web service qual parameter", required = false, arity = 1) + public String qual; + + @Parameter(names = {"--filter"}, description = "The body web service filter parameter", required = false, arity = 1) + public String filter; + + @Parameter(names = {"--file-data"}, description = "The body web service fileData parameter", required = false, arity = 1) + public String fileData; + + @Parameter(names = {"--genotype"}, description = "The body web service genotype parameter", required = false, arity = 1) + public String genotype; + + @Parameter(names = {"--sample"}, description = "The body web service sample parameter", required = false, arity = 1) + public String sample; + + @Parameter(names = {"--sample-limit"}, description = "The body web service sampleLimit parameter", required = false, arity = 1) + public Integer sampleLimit; + + @Parameter(names = {"--sample-skip"}, description = "The body web service sampleSkip parameter", required = false, arity = 1) + public Integer sampleSkip; + + @Parameter(names = {"--sample-data"}, description = "The body web service sampleData parameter", required = false, arity = 1) + public String sampleData; + + @Parameter(names = {"--sample-annotation"}, description = "The body web service sampleAnnotation parameter", required = false, arity = 1) + public String sampleAnnotation; + + @Parameter(names = {"--family"}, description = "The body web service family parameter", required = false, arity = 1) + public String family; + + @Parameter(names = {"--family-members"}, description = "The body web service familyMembers parameter", required = false, arity = 1) + public String familyMembers; + + @Parameter(names = {"--family-disorder"}, description = "The body web service familyDisorder parameter", required = false, arity = 1) + public String familyDisorder; + + @Parameter(names = {"--family-proband"}, description = "The body web service familyProband parameter", required = false, arity = 1) + public String familyProband; + + @Parameter(names = {"--family-segregation"}, description = "The body web service familySegregation parameter", required = false, arity = 1) + public String familySegregation; + + @Parameter(names = {"--cohort"}, description = "The body web service cohort parameter", required = false, arity = 1) + public String cohort; + + @Parameter(names = {"--cohort-stats-pass"}, description = "The body web service cohortStatsPass parameter", required = false, arity = 1) + public String cohortStatsPass; + + @Parameter(names = {"--cohort-stats-mgf"}, description = "The body web service cohortStatsMgf parameter", required = false, arity = 1) + public String cohortStatsMgf; + + @Parameter(names = {"--missing-alleles"}, description = "The body 
web service missingAlleles parameter", required = false, arity = 1) + public String missingAlleles; + + @Parameter(names = {"--missing-genotypes"}, description = "The body web service missingGenotypes parameter", required = false, arity = 1) + public String missingGenotypes; + + @Parameter(names = {"--annotation-exists"}, description = "The body web service annotationExists parameter", required = false, arity = 1) + public Boolean annotationExists; + + @Parameter(names = {"--score"}, description = "The body web service score parameter", required = false, arity = 1) + public String score; + + @Parameter(names = {"--polyphen"}, description = "The body web service polyphen parameter", required = false, arity = 1) + public String polyphen; + + @Parameter(names = {"--sift"}, description = "The body web service sift parameter", required = false, arity = 1) + public String sift; + + @Parameter(names = {"--gene-role-in-cancer"}, description = "The body web service geneRoleInCancer parameter", required = false, arity = 1) + public String geneRoleInCancer; + + @Parameter(names = {"--gene-trait-id"}, description = "The body web service geneTraitId parameter", required = false, arity = 1) + public String geneTraitId; + + @Parameter(names = {"--gene-trait-name"}, description = "The body web service geneTraitName parameter", required = false, arity = 1) + public String geneTraitName; + + @Parameter(names = {"--trait"}, description = "The body web service trait parameter", required = false, arity = 1) + public String trait; + + @Parameter(names = {"--cosmic"}, description = "The body web service cosmic parameter", required = false, arity = 1) + public String cosmic; + + @Parameter(names = {"--clinvar"}, description = "The body web service clinvar parameter", required = false, arity = 1) + public String clinvar; + + @Parameter(names = {"--hpo"}, description = "The body web service hpo parameter", required = false, arity = 1) + public String hpo; + + @Parameter(names = {"--go"}, description = "The body web service go parameter", required = false, arity = 1) + public String go; + + @Parameter(names = {"--expression"}, description = "The body web service expression parameter", required = false, arity = 1) + public String expression; + + @Parameter(names = {"--protein-keyword"}, description = "The body web service proteinKeyword parameter", required = false, arity = 1) + public String proteinKeyword; + + @Parameter(names = {"--drug"}, description = "The body web service drug parameter", required = false, arity = 1) + public String drug; + + @Parameter(names = {"--custom-annotation"}, description = "The body web service customAnnotation parameter", required = false, arity = 1) + public String customAnnotation; + + @Parameter(names = {"--unknown-genotype"}, description = "The body web service unknownGenotype parameter", required = false, arity = 1) + public String unknownGenotype; + + @Parameter(names = {"--sample-metadata"}, description = "The body web service sampleMetadata parameter", required = false, help = true, arity = 0) + public boolean sampleMetadata = false; + + @Parameter(names = {"--sort"}, description = "The body web service sort parameter", required = false, help = true, arity = 0) + public boolean sort = false; + + @Parameter(names = {"--outdir"}, description = "The body web service outdir parameter", required = false, arity = 1) + public String outdir; + + @Parameter(names = {"--output-file-name"}, description = "The body web service outputFileName parameter", required = false, arity = 1) + 
public String outputFileName; + + @Parameter(names = {"--file-format"}, description = "The body web service fileFormat parameter", required = false, arity = 1) + public String fileFormat; + + @Parameter(names = {"--docker-image"}, description = "The body web service dockerImage parameter", required = false, arity = 1) + public String dockerImage; + + @Parameter(names = {"--command-line"}, description = "The body web service commandLine parameter", required = false, arity = 1) + public String commandLine; + + @Parameter(names = {"--body_include"}, description = "The body web service include parameter", required = false, arity = 1) + public String bodyInclude; + + @Parameter(names = {"--body_exclude"}, description = "The body web service exclude parameter", required = false, arity = 1) + public String bodyExclude; + + } + } \ No newline at end of file diff --git a/opencga-client/src/main/R/R/Variant-methods.R b/opencga-client/src/main/R/R/Variant-methods.R index 5413a7604b5..b6979ac0c5e 100644 --- a/opencga-client/src/main/R/R/Variant-methods.R +++ b/opencga-client/src/main/R/R/Variant-methods.R @@ -58,6 +58,7 @@ #' | runSampleStats | /{apiVersion}/analysis/variant/sample/stats/run | study, jobId, jobDescription, jobDependsOn, jobTags, jobScheduledStartTime, jobPriority, jobDryRun, body[*] | #' | runStatsExport | /{apiVersion}/analysis/variant/stats/export/run | project, study, jobId, jobDescription, jobDependsOn, jobTags, jobScheduledStartTime, jobPriority, jobDryRun, body[*] | #' | runStats | /{apiVersion}/analysis/variant/stats/run | study, jobId, jobDescription, jobDependsOn, jobTags, jobScheduledStartTime, jobPriority, jobDryRun, body[*] | +#' | runWalker | /{apiVersion}/analysis/variant/walker/run | include, exclude, project, study, jobId, jobDescription, jobDependsOn, jobTags, jobScheduledStartTime, jobPriority, jobDryRun, body[*] | #' #' @md #' @seealso \url{http://docs.opencb.org/display/opencga/Using+OpenCGA} and the RESTful API documentation @@ -718,5 +719,22 @@ setMethod("variantClient", "OpencgaR", function(OpencgaR, endpointName, params=N #' @param data Variant stats params. runStats=fetchOpenCGA(object=OpencgaR, category="analysis", categoryId=NULL, subcategory="variant/stats", subcategoryId=NULL, action="run", params=params, httpMethod="POST", as.queryParam=NULL, ...), + + #' @section Endpoint /{apiVersion}/analysis/variant/walker/run: + #' Filter and walk variants from the variant storage to produce a file. + #' @param include Fields included in the response, whole JSON path must be provided. + #' @param exclude Fields excluded in the response, whole JSON path must be provided. + #' @param project Project [organization@]project where project can be either the ID or the alias. + #' @param study Study [[organization@]project:]study where study and project can be either the ID or UUID. + #' @param jobId Job ID. It must be a unique string within the study. An ID will be autogenerated automatically if not provided. + #' @param jobDescription Job description. + #' @param jobDependsOn Comma separated list of existing job IDs the job will depend on. + #' @param jobTags Job tags. + #' @param jobScheduledStartTime Time when the job is scheduled to start. + #' @param jobPriority Priority of the job. + #' @param jobDryRun Flag indicating that the job will be executed in dry-run mode. In this mode, OpenCGA will validate that all parameters and prerequisites are correctly set for successful execution, but the job will not actually run. + #' @param data Variant walker params. 
+ runWalker=fetchOpenCGA(object=OpencgaR, category="analysis", categoryId=NULL, subcategory="variant/walker", + subcategoryId=NULL, action="run", params=params, httpMethod="POST", as.queryParam=NULL, ...), ) }) \ No newline at end of file diff --git a/opencga-client/src/main/java/org/opencb/opencga/client/rest/clients/VariantClient.java index 6a68ae8ea82..04b6aa2da4d 100644 --- a/opencga-client/src/main/java/org/opencb/opencga/client/rest/clients/VariantClient.java +++ b/opencga-client/src/main/java/org/opencb/opencga/client/rest/clients/VariantClient.java @@ -55,6 +55,7 @@ import org.opencb.opencga.core.models.variant.SampleVariantStatsAnalysisParams; import org.opencb.opencga.core.models.variant.VariantExportParams; import org.opencb.opencga.core.models.variant.VariantStatsAnalysisParams; +import org.opencb.opencga.core.models.variant.VariantWalkerParams; import org.opencb.opencga.core.response.RestResponse; @@ -1101,4 +1102,29 @@ public RestResponse<Job> runStats(VariantStatsAnalysisParams data, ObjectMap par params.put("body", data); return execute("analysis", null, "variant/stats", null, "run", params, POST, Job.class); } + + /** + * Filter and walk variants from the variant storage to produce a file. + * @param data Variant walker params. + * @param params Map containing any of the following optional parameters. + * include: Fields included in the response, whole JSON path must be provided. + * exclude: Fields excluded in the response, whole JSON path must be provided. + * project: Project [organization@]project where project can be either the ID or the alias. + * study: Study [[organization@]project:]study where study and project can be either the ID or UUID. + * jobId: Job ID. It must be a unique string within the study. An ID will be autogenerated automatically if not provided. + * jobDescription: Job description. + * jobDependsOn: Comma separated list of existing job IDs the job will depend on. + * jobTags: Job tags. + * jobScheduledStartTime: Time when the job is scheduled to start. + * jobPriority: Priority of the job. + * jobDryRun: Flag indicating that the job will be executed in dry-run mode. In this mode, OpenCGA will validate that all + * parameters and prerequisites are correctly set for successful execution, but the job will not actually run. + * @return a RestResponse object. + * @throws ClientException ClientException if there is any server error. + */ + public RestResponse<Job> runWalker(VariantWalkerParams data, ObjectMap params) throws ClientException { + params = params != null ? params : new ObjectMap(); + params.put("body", data); + return execute("analysis", null, "variant/walker", null, "run", params, POST, Job.class); + } } diff --git a/opencga-client/src/main/javascript/Variant.js index 7d8b01966da..910b01cc8c1 100644 --- a/opencga-client/src/main/javascript/Variant.js +++ b/opencga-client/src/main/javascript/Variant.js @@ -959,4 +959,26 @@ export default class Variant extends OpenCGAParentClass { return this._post("analysis", null, "variant/stats", null, "run", data, params); } + /** Filter and walk variants from the variant storage to produce a file + * @param {Object} data - Variant walker params. + * @param {Object} [params] - The Object containing the following optional parameters: + * @param {String} [params.include] - Fields included in the response, whole JSON path must be provided. 
+ * @param {String} [params.exclude] - Fields excluded in the response, whole JSON path must be provided. + * @param {String} [params.project] - Project [organization@]project where project can be either the ID or the alias. + * @param {String} [params.study] - Study [[organization@]project:]study where study and project can be either the ID or UUID. + * @param {String} [params.jobId] - Job ID. It must be a unique string within the study. An ID will be autogenerated automatically if not + * provided. + * @param {String} [params.jobDescription] - Job description. + * @param {String} [params.jobDependsOn] - Comma separated list of existing job IDs the job will depend on. + * @param {String} [params.jobTags] - Job tags. + * @param {String} [params.jobScheduledStartTime] - Time when the job is scheduled to start. + * @param {String} [params.jobPriority] - Priority of the job. + * @param {Boolean} [params.jobDryRun] - Flag indicating that the job will be executed in dry-run mode. In this mode, OpenCGA will + * validate that all parameters and prerequisites are correctly set for successful execution, but the job will not actually run. + * @returns {Promise} Promise object in the form of RestResponse instance. + */ + runWalker(data, params) { + return this._post("analysis", null, "variant/walker", null, "run", data, params); + } + } \ No newline at end of file diff --git a/opencga-client/src/main/python/pyopencga/rest_clients/variant_client.py b/opencga-client/src/main/python/pyopencga/rest_clients/variant_client.py index 3993f48ba22..166e14137a2 100644 --- a/opencga-client/src/main/python/pyopencga/rest_clients/variant_client.py +++ b/opencga-client/src/main/python/pyopencga/rest_clients/variant_client.py @@ -1312,3 +1312,34 @@ def run_stats(self, data=None, **options): return self._post(category='analysis', resource='run', subcategory='variant/stats', data=data, **options) + def run_walker(self, data=None, **options): + """ + Filter and walk variants from the variant storage to produce a file. + PATH: /{apiVersion}/analysis/variant/walker/run + + :param dict data: Variant walker params. (REQUIRED) + :param str include: Fields included in the response, whole JSON path + must be provided. + :param str exclude: Fields excluded in the response, whole JSON path + must be provided. + :param str project: Project [organization@]project where project can + be either the ID or the alias. + :param str study: Study [[organization@]project:]study where study and + project can be either the ID or UUID. + :param str job_id: Job ID. It must be a unique string within the + study. An ID will be autogenerated automatically if not provided. + :param str job_description: Job description. + :param str job_depends_on: Comma separated list of existing job IDs + the job will depend on. + :param str job_tags: Job tags. + :param str job_scheduled_start_time: Time when the job is scheduled to + start. + :param str job_priority: Priority of the job. + :param bool job_dry_run: Flag indicating that the job will be executed + in dry-run mode. In this mode, OpenCGA will validate that all + parameters and prerequisites are correctly set for successful + execution, but the job will not actually run. 
+ """ + + return self._post(category='analysis', resource='run', subcategory='variant/walker', data=data, **options) + diff --git a/opencga-core/src/main/java/org/opencb/opencga/core/cellbase/CellBaseValidator.java b/opencga-core/src/main/java/org/opencb/opencga/core/cellbase/CellBaseValidator.java index e6b7f7353c8..88c06062bcb 100644 --- a/opencga-core/src/main/java/org/opencb/opencga/core/cellbase/CellBaseValidator.java +++ b/opencga-core/src/main/java/org/opencb/opencga/core/cellbase/CellBaseValidator.java @@ -12,13 +12,11 @@ import org.opencb.commons.datastore.core.ObjectMap; import org.opencb.commons.datastore.core.QueryOptions; import org.opencb.commons.utils.VersionUtils; - import org.opencb.opencga.core.config.storage.CellBaseConfiguration; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import java.io.IOException; - import java.util.ArrayList; import java.util.Collections; import java.util.Comparator; @@ -148,13 +146,13 @@ public void validate() throws IOException { private CellBaseConfiguration validate(boolean autoComplete) throws IOException { CellBaseConfiguration cellBaseConfiguration = getCellBaseConfiguration(); String inputVersion = getVersion(); - CellBaseDataResponse species; + SpeciesProperties species; try { species = retryMetaSpecies(); } catch (RuntimeException e) { throw new IllegalArgumentException("Unable to access cellbase url '" + getURL() + "', version '" + inputVersion + "'", e); } - if (species == null || species.firstResult() == null) { + if (species == null) { if (autoComplete && !cellBaseConfiguration.getVersion().startsWith("v")) { // Version might be missing the starting "v" cellBaseConfiguration.setVersion("v" + cellBaseConfiguration.getVersion()); @@ -162,10 +160,10 @@ private CellBaseConfiguration validate(boolean autoComplete) throws IOException species = retryMetaSpecies(); } } - if (species == null || species.firstResult() == null) { + if (species == null) { throw new IllegalArgumentException("Unable to access cellbase url '" + getURL() + "', version '" + inputVersion + "'"); } - validateSpeciesAssembly(species.firstResult()); + validateSpeciesAssembly(species); String serverVersion = getVersionFromServer(); if (!supportsDataRelease(serverVersion)) { @@ -324,14 +322,18 @@ public String getVersionFromServer() throws IOException { } private ObjectMap retryMetaAbout() throws IOException { - return retry(3, () -> cellBaseClient.getMetaClient().about().firstResult()); + return retry("meta/about", () -> cellBaseClient.getMetaClient().about().firstResult()); } - private CellBaseDataResponse retryMetaSpecies() throws IOException { - return retry(3, () -> cellBaseClient.getMetaClient().species()); + private SpeciesProperties retryMetaSpecies() throws IOException { + return retry("meta/species", () -> cellBaseClient.getMetaClient().species().firstResult()); } - private T retry(int retries, Callable function) throws IOException { + private T retry(String name, Callable function) throws IOException { + return retry(name, function, 3); + } + + private T retry(String name, Callable function, int retries) throws IOException { if (retries <= 0) { return null; } @@ -345,8 +347,8 @@ private T retry(int retries, Callable function) throws IOException { if (result == null) { try { // Retry - logger.warn("Unable to get reach cellbase " + toString() + ". Retrying..."); - result = retry(retries - 1, function); + logger.warn("Unable to get '{}' from cellbase " + toString() + ". 
Retrying...", name); + result = retry(name, function, retries - 1); } catch (Exception e1) { if (e == null) { e = e1; @@ -359,7 +361,6 @@ private T retry(int retries, Callable function) throws IOException { throw new IOException("Error reading from cellbase " + toString(), e); } } - } return result; } diff --git a/opencga-core/src/main/java/org/opencb/opencga/core/config/ConfigurationOption.java b/opencga-core/src/main/java/org/opencb/opencga/core/config/ConfigurationOption.java index 20024e91520..8101aa8c2e3 100644 --- a/opencga-core/src/main/java/org/opencb/opencga/core/config/ConfigurationOption.java +++ b/opencga-core/src/main/java/org/opencb/opencga/core/config/ConfigurationOption.java @@ -6,6 +6,10 @@ public interface ConfigurationOption { T defaultValue(); + default boolean isProtected() { + return false; + } + // default boolean isFinal() { // return false; // } diff --git a/opencga-core/src/main/java/org/opencb/opencga/core/models/variant/VariantWalkerParams.java b/opencga-core/src/main/java/org/opencb/opencga/core/models/variant/VariantWalkerParams.java new file mode 100644 index 00000000000..ef541690fc9 --- /dev/null +++ b/opencga-core/src/main/java/org/opencb/opencga/core/models/variant/VariantWalkerParams.java @@ -0,0 +1,75 @@ +package org.opencb.opencga.core.models.variant; + +public class VariantWalkerParams extends VariantQueryParams { + public static final String DESCRIPTION = "Variant walker params"; + private String outdir; + private String outputFileName; + private String fileFormat; + private String dockerImage; + private String commandLine; + private String include; + private String exclude; + + public String getOutdir() { + return outdir; + } + + public VariantWalkerParams setOutdir(String outdir) { + this.outdir = outdir; + return this; + } + + public String getOutputFileName() { + return outputFileName; + } + + public VariantWalkerParams setOutputFileName(String outputFileName) { + this.outputFileName = outputFileName; + return this; + } + + public String getFileFormat() { + return fileFormat; + } + + public VariantWalkerParams setFileFormat(String fileFormat) { + this.fileFormat = fileFormat; + return this; + } + + public String getDockerImage() { + return dockerImage; + } + + public VariantWalkerParams setDockerImage(String dockerImage) { + this.dockerImage = dockerImage; + return this; + } + + public String getCommandLine() { + return commandLine; + } + + public VariantWalkerParams setCommandLine(String commandLine) { + this.commandLine = commandLine; + return this; + } + + public String getInclude() { + return include; + } + + public VariantWalkerParams setInclude(String include) { + this.include = include; + return this; + } + + public String getExclude() { + return exclude; + } + + public VariantWalkerParams setExclude(String exclude) { + this.exclude = exclude; + return this; + } +} diff --git a/opencga-server/src/main/java/org/opencb/opencga/server/rest/analysis/AnalysisWebService.java b/opencga-server/src/main/java/org/opencb/opencga/server/rest/analysis/AnalysisWebService.java index 7fa1ebe6dd0..dc19bbe85bd 100644 --- a/opencga-server/src/main/java/org/opencb/opencga/server/rest/analysis/AnalysisWebService.java +++ b/opencga-server/src/main/java/org/opencb/opencga/server/rest/analysis/AnalysisWebService.java @@ -16,7 +16,6 @@ package org.opencb.opencga.server.rest.analysis; -import org.opencb.opencga.catalog.managers.JobManager; import org.opencb.opencga.core.exceptions.VersionException; import org.opencb.opencga.server.rest.OpenCGAWSServer; @@ -31,8 
+30,6 @@ */ public class AnalysisWebService extends OpenCGAWSServer { - protected JobManager jobManager; - public AnalysisWebService(@Context UriInfo uriInfo, @Context HttpServletRequest httpServletRequest, @Context HttpHeaders httpHeaders) throws IOException, VersionException { this(uriInfo.getPathParameters().getFirst("apiVersion"), uriInfo, httpServletRequest, httpHeaders); @@ -41,8 +38,6 @@ public AnalysisWebService(@Context UriInfo uriInfo, @Context HttpServletRequest public AnalysisWebService(String apiVersion, @Context UriInfo uriInfo, @Context HttpServletRequest httpServletRequest, @Context HttpHeaders httpHeaders) throws IOException, VersionException { super(apiVersion, uriInfo, httpServletRequest, httpHeaders); - - this.jobManager = catalogManager.getJobManager(); } } diff --git a/opencga-server/src/main/java/org/opencb/opencga/server/rest/analysis/VariantWebService.java b/opencga-server/src/main/java/org/opencb/opencga/server/rest/analysis/VariantWebService.java index 70748fffb0c..5c824ac21c4 100644 --- a/opencga-server/src/main/java/org/opencb/opencga/server/rest/analysis/VariantWebService.java +++ b/opencga-server/src/main/java/org/opencb/opencga/server/rest/analysis/VariantWebService.java @@ -31,6 +31,7 @@ import org.opencb.opencga.analysis.individual.qc.IndividualQcAnalysis; import org.opencb.opencga.analysis.sample.qc.SampleQcAnalysis; import org.opencb.opencga.analysis.variant.VariantExportTool; +import org.opencb.opencga.analysis.variant.VariantWalkerTool; import org.opencb.opencga.analysis.variant.circos.CircosAnalysis; import org.opencb.opencga.analysis.variant.circos.CircosLocalAnalysisExecutor; import org.opencb.opencga.analysis.variant.genomePlot.GenomePlotAnalysis; @@ -411,6 +412,27 @@ public Response export( return submitJob(VariantExportTool.ID, project, study, params, jobName, jobDescription, dependsOn, jobTags, scheduledStartTime, jobPriority, dryRun); } + @POST + @Path("/walker/run") + @ApiOperation(value = VariantWalkerTool.DESCRIPTION, response = Job.class) + @ApiImplicitParams({ + @ApiImplicitParam(name = QueryOptions.INCLUDE, value = ParamConstants.INCLUDE_DESCRIPTION, example = "name,attributes", dataType = "string", paramType = "query"), + @ApiImplicitParam(name = QueryOptions.EXCLUDE, value = ParamConstants.EXCLUDE_DESCRIPTION, example = "id,status", dataType = "string", paramType = "query"), + }) + public Response walker( + @ApiParam(value = ParamConstants.PROJECT_DESCRIPTION) @QueryParam(ParamConstants.PROJECT_PARAM) String project, + @ApiParam(value = ParamConstants.STUDY_DESCRIPTION) @QueryParam(ParamConstants.STUDY_PARAM) String study, + @ApiParam(value = ParamConstants.JOB_ID_CREATION_DESCRIPTION) @QueryParam(ParamConstants.JOB_ID) String jobName, + @ApiParam(value = ParamConstants.JOB_DESCRIPTION_DESCRIPTION) @QueryParam(ParamConstants.JOB_DESCRIPTION) String jobDescription, + @ApiParam(value = ParamConstants.JOB_DEPENDS_ON_DESCRIPTION) @QueryParam(JOB_DEPENDS_ON) String dependsOn, + @ApiParam(value = ParamConstants.JOB_TAGS_DESCRIPTION) @QueryParam(ParamConstants.JOB_TAGS) String jobTags, + @ApiParam(value = ParamConstants.JOB_SCHEDULED_START_TIME_DESCRIPTION) @QueryParam(ParamConstants.JOB_SCHEDULED_START_TIME) String scheduledStartTime, + @ApiParam(value = ParamConstants.JOB_PRIORITY_DESCRIPTION) @QueryParam(ParamConstants.SUBMIT_JOB_PRIORITY_PARAM) String jobPriority, + @ApiParam(value = ParamConstants.JOB_DRY_RUN_DESCRIPTION) @QueryParam(ParamConstants.JOB_DRY_RUN) Boolean dryRun, + @ApiParam(value = VariantWalkerParams.DESCRIPTION, required 
= true) VariantWalkerParams params) {
+        return submitJob(VariantWalkerTool.ID, project, study, params, jobName, jobDescription, dependsOn, jobTags, scheduledStartTime, jobPriority, dryRun);
+    }
+
     @GET
     @Path("/annotation/query")
     @ApiOperation(value = "Query variant annotations from any saved versions", response = VariantAnnotation.class)
diff --git a/opencga-storage/opencga-storage-core/src/main/java/org/opencb/opencga/storage/core/variant/VariantStorageEngine.java b/opencga-storage/opencga-storage-core/src/main/java/org/opencb/opencga/storage/core/variant/VariantStorageEngine.java
index 77327d9d76a..bf46887740f 100644
--- a/opencga-storage/opencga-storage-core/src/main/java/org/opencb/opencga/storage/core/variant/VariantStorageEngine.java
+++ b/opencga-storage/opencga-storage-core/src/main/java/org/opencb/opencga/storage/core/variant/VariantStorageEngine.java
@@ -34,7 +34,6 @@
 import org.opencb.opencga.core.models.operations.variant.VariantAggregateFamilyParams;
 import org.opencb.opencga.core.models.operations.variant.VariantAggregateParams;
 import org.opencb.opencga.core.models.variant.VariantSetupParams;
-import org.opencb.opencga.storage.core.variant.query.VariantQueryResult;
 import org.opencb.opencga.storage.core.StorageEngine;
 import org.opencb.opencga.storage.core.StoragePipelineResult;
 import org.opencb.opencga.storage.core.exceptions.StorageEngineException;
@@ -57,9 +56,11 @@
 import org.opencb.opencga.storage.core.variant.io.VariantExporter;
 import org.opencb.opencga.storage.core.variant.io.VariantImporter;
 import org.opencb.opencga.storage.core.variant.io.VariantReaderUtils;
+import org.opencb.opencga.storage.core.variant.io.VariantWriterFactory;
 import org.opencb.opencga.storage.core.variant.io.VariantWriterFactory.VariantOutputFormat;
 import org.opencb.opencga.storage.core.variant.query.ParsedVariantQuery;
 import org.opencb.opencga.storage.core.variant.query.VariantQueryParser;
+import org.opencb.opencga.storage.core.variant.query.VariantQueryResult;
 import org.opencb.opencga.storage.core.variant.query.VariantQueryUtils;
 import org.opencb.opencga.storage.core.variant.query.executors.*;
 import org.opencb.opencga.storage.core.variant.score.VariantScoreFormatDescriptor;
@@ -284,6 +285,48 @@ public List exportData(URI outputFile, VariantOutputFormat outputFormat, UR
         return exporter.export(outputFile, outputFormat, variantsFile, parsedVariantQuery);
     }
 
+    public List<URI> walkData(URI outputFile, VariantWriterFactory.VariantOutputFormat format, Query query, QueryOptions queryOptions,
+                              String dockerImage, String commandLine)
+            throws IOException, StorageEngineException {
+        if (format == VariantWriterFactory.VariantOutputFormat.VCF || format == VariantWriterFactory.VariantOutputFormat.VCF_GZ) {
+            if (!isValidParam(query, VariantQueryParam.UNKNOWN_GENOTYPE)) {
+                query.put(VariantQueryParam.UNKNOWN_GENOTYPE.key(), "./.");
+            }
+        }
+        // The command line is wrapped in single quotes below, so escape any embedded single quotes
+        commandLine = commandLine.replace("'", "'\"'\"'");
+
+        String memory = getOptions().getString(WALKER_DOCKER_MEMORY.key(), WALKER_DOCKER_MEMORY.defaultValue());
+        String cpu = getOptions().getString(WALKER_DOCKER_CPU.key(), WALKER_DOCKER_CPU.defaultValue());
+        String user = getOptions().getString(WALKER_DOCKER_USER.key(), WALKER_DOCKER_USER.defaultValue());
+        String envs = getOptions().getString(WALKER_DOCKER_ENV.key(), WALKER_DOCKER_ENV.defaultValue());
+        String volume = getOptions().getString(WALKER_DOCKER_MOUNT.key(), WALKER_DOCKER_MOUNT.defaultValue());
+        String opts = getOptions().getString(WALKER_DOCKER_OPTS.key(), WALKER_DOCKER_OPTS.defaultValue());
+
+        String dockerCommandLine = "docker run --rm -i "
+                + "--memory " + memory + " "
+                + "--cpus " + cpu + " "
+                + "--user " + user + " ";
+
+        if (StringUtils.isNotEmpty(volume)) {
+            dockerCommandLine += "-v " + volume + ":/data ";
+        }
+
+        if (StringUtils.isNotEmpty(envs)) {
+            for (String s : envs.split(",")) {
+                dockerCommandLine += "--env " + s + " ";
+            }
+        }
+        dockerCommandLine = dockerCommandLine
+                + opts + " " // trailing space so that custom opts do not run into the image name
+                + dockerImage + " bash -ce '" + commandLine + "'";
+        return walkData(outputFile, format, query, queryOptions, dockerCommandLine);
+    }
+
+    public abstract List<URI> walkData(URI outputFile, VariantOutputFormat format, Query query, QueryOptions queryOptions,
+                                       String commandLine)
+            throws StorageEngineException;
+
     /**
      * Creates a new {@link VariantExporter} for the current backend.
     * The default implementation iterates locally through the database.
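
Note: with the default walker.docker.* options introduced below (memory "512m", cpu "1", user "root", and empty mount/env/opts) and purely illustrative image and command values, the wrapper assembled above comes out roughly as

    docker run --rm -i --memory 512m --cpus 1 --user root my-walker:latest bash -ce 'cut -f 1-5'

The user command ends up wrapped in single quotes, which is why walkData first escapes embedded single quotes as '"'"'. The resulting string is passed to the abstract walkData(..., commandLine) overload and ultimately executed by StreamVariantMapper through "bash -ce". The image name and command here are placeholders, not values taken from this patch.
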
diff --git a/opencga-storage/opencga-storage-core/src/main/java/org/opencb/opencga/storage/core/variant/VariantStorageOptions.java b/opencga-storage/opencga-storage-core/src/main/java/org/opencb/opencga/storage/core/variant/VariantStorageOptions.java
index 73abcbae61f..f7736bd5b1f 100644
--- a/opencga-storage/opencga-storage-core/src/main/java/org/opencb/opencga/storage/core/variant/VariantStorageOptions.java
+++ b/opencga-storage/opencga-storage-core/src/main/java/org/opencb/opencga/storage/core/variant/VariantStorageOptions.java
@@ -100,6 +100,13 @@ public enum VariantStorageOptions implements ConfigurationOption {
     QUERY_SAMPLE_LIMIT_DEFAULT("query.sample.limit.default", 100),
     QUERY_SAMPLE_LIMIT_MAX("query.sample.limit.max", 1000),
 
+    WALKER_DOCKER_MEMORY("walker.docker.memory", "512m", true),
+    WALKER_DOCKER_CPU("walker.docker.cpu", "1", true),
+    WALKER_DOCKER_USER("walker.docker.user", "root", true),
+    WALKER_DOCKER_ENV("walker.docker.env", "", true),
+    WALKER_DOCKER_MOUNT("walker.docker.mount", "", true),
+    WALKER_DOCKER_OPTS("walker.docker.opts", "", true),
+
     // Search intersect options
     INTERSECT_ACTIVE("search.intersect.active", true), // Allow intersect queries with the SearchEngine (Solr)
     INTERSECT_ALWAYS("search.intersect.always", false), // Force intersect queries
@@ -133,15 +140,24 @@
     private final String key;
     private final Object value;
+    private final boolean isProtected;
 
     VariantStorageOptions(String key) {
         this.key = key;
         this.value = null;
+        this.isProtected = false;
     }
 
     VariantStorageOptions(String key, Object value) {
         this.key = key;
         this.value = value;
+        this.isProtected = false;
+    }
+
+    VariantStorageOptions(String key, Object value, boolean isProtected) {
+        this.key = key;
+        this.value = value;
+        this.isProtected = isProtected;
     }
 
     public String key() {
@@ -153,4 +169,10 @@
     public <T> T defaultValue() {
         return (T) value;
     }
 
+    @Override
+    public boolean isProtected() {
+        return isProtected;
+    }
+
 }
diff --git a/opencga-storage/opencga-storage-core/src/main/java/org/opencb/opencga/storage/core/variant/io/VariantWriterFactory.java b/opencga-storage/opencga-storage-core/src/main/java/org/opencb/opencga/storage/core/variant/io/VariantWriterFactory.java
index 509207e5be5..fa002facbd5 100644
--- a/opencga-storage/opencga-storage-core/src/main/java/org/opencb/opencga/storage/core/variant/io/VariantWriterFactory.java
+++ b/opencga-storage/opencga-storage-core/src/main/java/org/opencb/opencga/storage/core/variant/io/VariantWriterFactory.java
@@ -122,6 +122,14 @@
     public boolean isSnappy() {
         return extension.endsWith(".snappy");
     }
 
+    // Returns the equivalent uncompressed (plain) variant of this output format
+    public VariantOutputFormat inPlain() {
+        if (!isPlain())
{ + return VariantOutputFormat.valueOf(name().replace("_GZ", "").replace("_SNAPPY", "")); + } else { + return this; + } + } + public VariantOutputFormat withGzip() { try { if (isGzip()) { diff --git a/opencga-storage/opencga-storage-core/src/test/java/org/opencb/opencga/storage/core/variant/dummy/DummyVariantStorageEngine.java b/opencga-storage/opencga-storage-core/src/test/java/org/opencb/opencga/storage/core/variant/dummy/DummyVariantStorageEngine.java index 55903d221f2..e10370dcaaf 100644 --- a/opencga-storage/opencga-storage-core/src/test/java/org/opencb/opencga/storage/core/variant/dummy/DummyVariantStorageEngine.java +++ b/opencga-storage/opencga-storage-core/src/test/java/org/opencb/opencga/storage/core/variant/dummy/DummyVariantStorageEngine.java @@ -19,6 +19,8 @@ import org.opencb.biodata.models.variant.metadata.VariantMetadata; import org.opencb.commons.datastore.core.DataResult; import org.opencb.commons.datastore.core.ObjectMap; +import org.opencb.commons.datastore.core.Query; +import org.opencb.commons.datastore.core.QueryOptions; import org.opencb.opencga.core.config.DatabaseCredentials; import org.opencb.opencga.core.config.storage.StorageConfiguration; import org.opencb.opencga.core.config.storage.StorageEngineConfiguration; @@ -32,6 +34,7 @@ import org.opencb.opencga.storage.core.variant.VariantStorageEngine; import org.opencb.opencga.storage.core.variant.adaptors.VariantDBAdaptor; import org.opencb.opencga.storage.core.variant.io.VariantImporter; +import org.opencb.opencga.storage.core.variant.io.VariantWriterFactory; import org.opencb.opencga.storage.core.variant.score.VariantScoreFormatDescriptor; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -139,6 +142,11 @@ public void importData(URI input, VariantMetadata metadata, List walkData(URI outputFile, VariantWriterFactory.VariantOutputFormat format, Query query, QueryOptions queryOptions, String commandLine) throws StorageEngineException { + throw new UnsupportedOperationException("Unable to walk data in " + getStorageEngineId()); + } + @Override public void removeFiles(String study, List files, URI outdir) throws StorageEngineException { TaskMetadata task = preRemove(study, files, Collections.emptyList()); diff --git a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/utils/AbstractHBaseDriver.java b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/utils/AbstractHBaseDriver.java index 49fdbf02237..4e9fe7057a8 100644 --- a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/utils/AbstractHBaseDriver.java +++ b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/utils/AbstractHBaseDriver.java @@ -29,10 +29,7 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import java.io.BufferedReader; -import java.io.IOException; -import java.io.InputStream; -import java.io.InputStreamReader; +import java.io.*; import java.nio.charset.Charset; import java.nio.file.Files; import java.nio.file.Paths; @@ -41,6 +38,8 @@ import java.util.List; import java.util.Map; import java.util.function.Supplier; +import java.util.zip.GZIPInputStream; +import java.util.zip.GZIPOutputStream; import static org.opencb.opencga.core.common.IOUtils.humanReadableByteCount; import static org.opencb.opencga.storage.hadoop.variant.HadoopVariantStorageOptions.MR_EXECUTOR_SSH_PASSWORD; @@ 
-469,16 +468,28 @@ protected List concatMrOutputToLocal(Path mrOutdir, Path localOutput, bool
         LOGGER.info(" Source : " + mrOutdir.toUri());
         LOGGER.info(" Target : " + localOutput.toUri());
         LOGGER.info(" ---- ");
-        try (FSDataOutputStream os = localOutput.getFileSystem(getConf()).create(localOutput)) {
+        try (FSDataOutputStream fsOs = localOutput.getFileSystem(getConf()).create(localOutput)) {
+            boolean isGzip = paths.get(0).getName().endsWith(".gz");
+            OutputStream os;
+            if (isGzip) {
+                os = new GZIPOutputStream(fsOs);
+            } else {
+                os = fsOs;
+            }
             for (int i = 0; i < paths.size(); i++) {
                 Path path = paths.get(i);
                 LOGGER.info("Concat file : '{}' {} ", path.toUri(),
                         humanReadableByteCount(fileSystem.getFileStatus(path).getLen(), false));
                 try (FSDataInputStream fsIs = fileSystem.open(path)) {
-                    BufferedReader br;
-                    br = new BufferedReader(new InputStreamReader(fsIs));
                     InputStream is;
+                    if (isGzip) {
+                        is = new GZIPInputStream(fsIs);
+                    } else {
+                        is = fsIs;
+                    }
+                    // Remove extra headers from all files but the first
                     if (removeExtraHeaders && i != 0) {
+                        BufferedReader br = new BufferedReader(new InputStreamReader(is));
                         String line;
                         do {
                             br.mark(10 * 1024 * 1024); //10MB
@@ -486,8 +497,6 @@ protected List concatMrOutputToLocal(Path mrOutdir, Path localOutput, bool
                         } while (line != null && line.startsWith("#"));
                         br.reset();
                         is = new ReaderInputStream(br, Charset.defaultCharset());
-                    } else {
-                        is = fsIs;
                     }
                     IOUtils.copyBytes(is, os, getConf(), false);
diff --git a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/utils/ValueOnlyTextOutputFormat.java b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/utils/ValueOnlyTextOutputFormat.java
new file mode 100644
index 00000000000..9d1759cf73f
--- /dev/null
+++ b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/utils/ValueOnlyTextOutputFormat.java
@@ -0,0 +1,33 @@
+package org.opencb.opencga.storage.hadoop.utils;
+
+import org.apache.hadoop.mapreduce.RecordWriter;
+import org.apache.hadoop.mapreduce.TaskAttemptContext;
+import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
+
+import java.io.IOException;
+
+public class ValueOnlyTextOutputFormat<K, V> extends TextOutputFormat<K, V> {
+
+    @Override
+    public RecordWriter<K, V> getRecordWriter(TaskAttemptContext job) throws IOException, InterruptedException {
+        return new ValueOnlyRecordWriter(super.getRecordWriter(job));
+    }
+
+    private class ValueOnlyRecordWriter extends RecordWriter<K, V> {
+        private final RecordWriter<K, V> recordWriter;
+
+        ValueOnlyRecordWriter(RecordWriter<K, V> recordWriter) {
+            this.recordWriter = recordWriter;
+        }
+
+        @Override
+        public void write(K key, V value) throws IOException, InterruptedException {
+            // TextOutputFormat omits the key (and the separator) when the key is null
+            recordWriter.write(null, value);
+        }
+
+        @Override
+        public void close(TaskAttemptContext context) throws IOException, InterruptedException {
+            recordWriter.close(context);
+        }
+    }
+}
diff --git a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/HadoopVariantStorageEngine.java b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/HadoopVariantStorageEngine.java
index 84f00b042e9..fdee34d3133 100644
--- a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/HadoopVariantStorageEngine.java
+++ 
b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/HadoopVariantStorageEngine.java @@ -59,6 +59,7 @@ import org.opencb.opencga.storage.core.variant.annotation.VariantAnnotationManager; import org.opencb.opencga.storage.core.variant.annotation.annotators.VariantAnnotator; import org.opencb.opencga.storage.core.variant.io.VariantExporter; +import org.opencb.opencga.storage.core.variant.io.VariantWriterFactory; import org.opencb.opencga.storage.core.variant.query.ParsedVariantQuery; import org.opencb.opencga.storage.core.variant.query.VariantQueryParser; import org.opencb.opencga.storage.core.variant.query.executors.*; @@ -94,6 +95,7 @@ import org.opencb.opencga.storage.hadoop.variant.index.sample.SampleIndexDBAdaptor; import org.opencb.opencga.storage.hadoop.variant.index.sample.SampleIndexDeleteHBaseColumnTask; import org.opencb.opencga.storage.hadoop.variant.io.HadoopVariantExporter; +import org.opencb.opencga.storage.hadoop.variant.mr.StreamVariantDriver; import org.opencb.opencga.storage.hadoop.variant.prune.VariantPruneManager; import org.opencb.opencga.storage.hadoop.variant.score.HadoopVariantScoreLoader; import org.opencb.opencga.storage.hadoop.variant.score.HadoopVariantScoreRemover; @@ -314,6 +316,23 @@ protected VariantExporter newVariantExporter(VariantMetadataFactory metadataFact return new HadoopVariantExporter(this, metadataFactory, getMRExecutor(), ioConnectorProvider); } + @Override + public List walkData(URI outputFile, VariantWriterFactory.VariantOutputFormat format, + Query query, QueryOptions queryOptions, String commandLine) throws StorageEngineException { + ParsedVariantQuery variantQuery = parseQuery(query, queryOptions); + int studyId = variantQuery.getStudyQuery().getDefaultStudy().getId(); + getMRExecutor().run(StreamVariantDriver.class, StreamVariantDriver.buildArgs( + null, + getVariantTableName(), studyId, null, + new ObjectMap().appendAll(variantQuery.getQuery()).appendAll(variantQuery.getInputOptions()) + .append(StreamVariantDriver.MAX_BYTES_PER_MAP_PARAM, 1024 * 10) + .append(StreamVariantDriver.COMMAND_LINE_BASE64_PARAM, Base64.getEncoder().encodeToString(commandLine.getBytes())) + .append(StreamVariantDriver.INPUT_FORMAT_PARAM, format.toString()) + .append(StreamVariantDriver.OUTPUT_PARAM, outputFile) + ), ""); + return null; + } + @Override public void deleteStats(String study, Collection cohorts, ObjectMap params) throws StorageEngineException { ObjectMap options = getMergedOptions(params); diff --git a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/io/VariantDriver.java b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/io/VariantDriver.java new file mode 100644 index 00000000000..755275263a7 --- /dev/null +++ b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/io/VariantDriver.java @@ -0,0 +1,177 @@ +package org.opencb.opencga.storage.hadoop.variant.io; + +import org.apache.commons.lang3.StringUtils; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.hbase.client.Scan; +import org.apache.hadoop.mapreduce.Job; +import org.apache.hadoop.mapreduce.Reducer; +import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; +import org.opencb.commons.datastore.core.Query; +import org.opencb.commons.datastore.core.QueryOptions; +import 
org.opencb.opencga.storage.core.exceptions.StorageEngineException; +import org.opencb.opencga.storage.core.variant.adaptors.VariantQueryParam; +import org.opencb.opencga.storage.core.variant.query.ParsedVariantQuery; +import org.opencb.opencga.storage.core.variant.query.VariantQueryParser; +import org.opencb.opencga.storage.hadoop.variant.AbstractVariantsTableDriver; +import org.opencb.opencga.storage.hadoop.variant.HadoopVariantQueryParser; +import org.opencb.opencga.storage.hadoop.variant.adaptors.VariantHBaseQueryParser; +import org.opencb.opencga.storage.hadoop.variant.adaptors.phoenix.VariantSqlQueryParser; +import org.opencb.opencga.storage.hadoop.variant.index.sample.SampleIndexDBAdaptor; +import org.opencb.opencga.storage.hadoop.variant.index.sample.SampleIndexQueryParser; +import org.opencb.opencga.storage.hadoop.variant.mr.VariantMapReduceUtil; +import org.opencb.opencga.storage.hadoop.variant.mr.VariantMapper; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.IOException; +import java.util.List; +import java.util.Map; + +import static org.opencb.opencga.storage.hadoop.variant.mr.VariantMapReduceUtil.getQueryFromConfig; +import static org.opencb.opencga.storage.hadoop.variant.mr.VariantMapReduceUtil.getQueryOptionsFromConfig; + +/** + * Created on 14/06/18. + * + * export HADOOP_USER_CLASSPATH_FIRST=true + * hbase_conf=$(hbase classpath | tr ":" "\n" | grep "/conf" | tr "\n" ":") + * export HADOOP_CLASSPATH=${hbase_conf}:$PWD/libs/avro-1.7.7.jar:$PWD/libs/jackson-databind-2.6.6.jar:$PWD/libs/jackson-core-2.6.6.jar + * export HADOOP_CLASSPATH=${HADOOP_CLASSPATH}:$PWD/libs/jackson-annotations-2.6.6.jar + * yarn jar opencga-storage-hadoop-core-1.4.0-jar-with-dependencies.jar \ + * org.opencb.opencga.storage.hadoop.variant.io.VariantExporterDriver \ + * opencga_variants study myStudy --of avro --output my.variants.avro --region 22 + * + * @author Jacobo Coll <jacobo167@gmail.com> + */ +public abstract class VariantDriver extends AbstractVariantsTableDriver { + + public static final String OUTPUT_PARAM = "output"; + public static final String CONCAT_OUTPUT_PARAM = "concat-output"; + private Path outdir; + private Path localOutput; + private Query query = new Query(); + private QueryOptions options = new QueryOptions(); + private static Logger logger = LoggerFactory.getLogger(VariantDriver.class); + protected boolean useReduceStep; + + @Override + protected void parseAndValidateParameters() throws IOException { + setStudyId(-1); + super.parseAndValidateParameters(); + String outdirStr = getParam(OUTPUT_PARAM); + if (StringUtils.isEmpty(outdirStr)) { + throw new IllegalArgumentException("Missing argument " + OUTPUT_PARAM); + } + + useReduceStep = Boolean.valueOf(getParam(CONCAT_OUTPUT_PARAM)); + outdir = new Path(outdirStr); + if (isLocal(outdir)) { + localOutput = getLocalOutput(outdir); + outdir = getTempOutdir("opencga_export", localOutput.getName()); + outdir.getFileSystem(getConf()).deleteOnExit(outdir); + } + if (localOutput != null) { + useReduceStep = true; + logger.info(" * Outdir file: " + localOutput.toUri()); + logger.info(" * Temporary outdir file: " + outdir.toUri()); + } else { + logger.info(" * Outdir file: " + outdir.toUri()); + } + + getQueryFromConfig(query, getConf()); + getQueryOptionsFromConfig(options, getConf()); + + logger.info(" * Query:"); + for (Map.Entry entry : query.entrySet()) { + logger.info(" * " + entry.getKey() + " : " + entry.getValue()); + } + } + + @Override + protected abstract Class getMapperClass(); + + protected 
abstract Class getReducerClass(); + + protected abstract Class getOutputFormatClass(); + + protected abstract void setupJob(Job job) throws IOException; + + @Override + protected final Job setupJob(Job job, String archiveTable, String variantTable) throws IOException { + setupJob(job); + Class mapperClass = getMapperClass(); + Class reducerClass = getReducerClass(); + if (mapperClass == null) { + throw new IllegalArgumentException("Mapper class not provided!"); + } + if (useReduceStep) { + if (reducerClass == null) { + throw new IllegalArgumentException("Reducer class not provided!"); + } + } + Class outputFormatClass = getOutputFormatClass(); + if (outputFormatClass == null) { + throw new IllegalArgumentException("Output format class not provided!"); + } + job.setOutputFormatClass(outputFormatClass); + + if (useReduceStep) { + logger.info("Use one Reduce task to produce a single file"); + job.setReducerClass(reducerClass); + job.setNumReduceTasks(1); + } else { + VariantMapReduceUtil.setNoneReduce(job); + } + + VariantQueryParser variantQueryParser = new HadoopVariantQueryParser(null, getMetadataManager()); + ParsedVariantQuery variantQuery = variantQueryParser.parseQuery(query, options); + Query query = variantQuery.getQuery(); + if (VariantHBaseQueryParser.isSupportedQuery(query)) { + logger.info("Init MapReduce job reading from HBase"); + boolean useSampleIndex = !getConf().getBoolean("skipSampleIndex", false) && SampleIndexQueryParser.validSampleIndexQuery(query); + if (useSampleIndex) { + // Remove extra fields from the query + new SampleIndexDBAdaptor(getHBaseManager(), getTableNameGenerator(), getMetadataManager()).parseSampleIndexQuery(query); + + logger.info("Use sample index to read from HBase"); + } + + VariantHBaseQueryParser parser = new VariantHBaseQueryParser(getMetadataManager()); + List scans = parser.parseQueryMultiRegion(variantQuery, options); + VariantMapReduceUtil.configureMapReduceScans(scans, getConf()); + + VariantMapReduceUtil.initVariantMapperJobFromHBase(job, variantTable, scans, mapperClass, useSampleIndex); + } else { + logger.info("Init MapReduce job reading from Phoenix"); + String sql = new VariantSqlQueryParser(variantTable, getMetadataManager(), getHelper().getConf()) + .parse(variantQuery, options); + + VariantMapReduceUtil.initVariantMapperJobFromPhoenix(job, variantTable, sql, mapperClass); + } + + setNoneTimestamp(job); + + FileOutputFormat.setOutputPath(job, outdir); // set Path + + VariantMapReduceUtil.configureVariantConverter(job.getConfiguration(), false, true, true, + query.getString(VariantQueryParam.UNKNOWN_GENOTYPE.key(), "./.")); + + + return job; + } + + + @Override + protected void postExecution(boolean succeed) throws IOException, StorageEngineException { + super.postExecution(succeed); + if (succeed) { + if (localOutput != null) { + concatMrOutputToLocal(outdir, localOutput); + } + } + if (localOutput != null) { + deleteTemporaryFile(outdir); + } + } + +} diff --git a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/io/VariantExporterDriver.java b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/io/VariantExporterDriver.java index aa342c10a92..c44e686e4d2 100644 --- a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/io/VariantExporterDriver.java +++ 
b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/io/VariantExporterDriver.java @@ -5,9 +5,6 @@ import org.apache.avro.mapred.AvroValue; import org.apache.avro.mapreduce.AvroJob; import org.apache.avro.mapreduce.AvroKeyOutputFormat; -import org.apache.commons.lang3.StringUtils; -import org.apache.hadoop.fs.Path; -import org.apache.hadoop.hbase.client.Scan; import org.apache.hadoop.io.NullWritable; import org.apache.hadoop.io.compress.CompressionCodec; import org.apache.hadoop.io.compress.DeflateCodec; @@ -25,100 +22,50 @@ import org.opencb.biodata.models.variant.Variant; import org.opencb.biodata.models.variant.avro.GeneCancerAssociation; import org.opencb.biodata.models.variant.avro.VariantAvro; -import org.opencb.commons.datastore.core.Query; -import org.opencb.commons.datastore.core.QueryOptions; -import org.opencb.opencga.storage.core.exceptions.StorageEngineException; -import org.opencb.opencga.storage.core.variant.adaptors.VariantQueryParam; -import org.opencb.opencga.storage.core.variant.io.VariantWriterFactory.VariantOutputFormat; -import org.opencb.opencga.storage.core.variant.query.ParsedVariantQuery; -import org.opencb.opencga.storage.core.variant.query.VariantQueryParser; +import org.opencb.opencga.storage.core.variant.io.VariantWriterFactory; import org.opencb.opencga.storage.hadoop.variant.AbstractVariantsTableDriver; -import org.opencb.opencga.storage.hadoop.variant.HadoopVariantQueryParser; -import org.opencb.opencga.storage.hadoop.variant.adaptors.VariantHBaseQueryParser; -import org.opencb.opencga.storage.hadoop.variant.adaptors.phoenix.VariantSqlQueryParser; -import org.opencb.opencga.storage.hadoop.variant.index.sample.SampleIndexDBAdaptor; -import org.opencb.opencga.storage.hadoop.variant.index.sample.SampleIndexQueryParser; import org.opencb.opencga.storage.hadoop.variant.mr.VariantFileOutputFormat; -import org.opencb.opencga.storage.hadoop.variant.mr.VariantMapReduceUtil; import org.opencb.opencga.storage.hadoop.variant.mr.VariantMapper; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; import java.io.IOException; import java.lang.invoke.MethodHandles; import java.util.List; -import java.util.Map; import java.util.Objects; import java.util.logging.Handler; import java.util.logging.Level; -import static org.opencb.opencga.storage.hadoop.variant.mr.VariantMapReduceUtil.getQueryFromConfig; -import static org.opencb.opencga.storage.hadoop.variant.mr.VariantMapReduceUtil.getQueryOptionsFromConfig; - -/** - * Created on 14/06/18. 
- * - * export HADOOP_USER_CLASSPATH_FIRST=true - * hbase_conf=$(hbase classpath | tr ":" "\n" | grep "/conf" | tr "\n" ":") - * export HADOOP_CLASSPATH=${hbase_conf}:$PWD/libs/avro-1.7.7.jar:$PWD/libs/jackson-databind-2.6.6.jar:$PWD/libs/jackson-core-2.6.6.jar - * export HADOOP_CLASSPATH=${HADOOP_CLASSPATH}:$PWD/libs/jackson-annotations-2.6.6.jar - * yarn jar opencga-storage-hadoop-core-1.4.0-jar-with-dependencies.jar \ - * org.opencb.opencga.storage.hadoop.variant.io.VariantExporterDriver \ - * opencga_variants study myStudy --of avro --output my.variants.avro --region 22 - * - * @author Jacobo Coll <jacobo167@gmail.com> - */ -public class VariantExporterDriver extends AbstractVariantsTableDriver { +public class VariantExporterDriver extends VariantDriver { public static final String OUTPUT_FORMAT_PARAM = "of"; - public static final String OUTPUT_PARAM = "output"; - public static final String CONCAT_OUTPUT_PARAM = "concat-output"; - private VariantOutputFormat outputFormat; - private Path outdir; - private Path localOutput; - private Query query = new Query(); - private QueryOptions options = new QueryOptions(); - private static Logger logger = LoggerFactory.getLogger(VariantExporterDriver.class); - private boolean useReduceStep; + private VariantWriterFactory.VariantOutputFormat outputFormat; + private Class mapperClass; + private Class reducerClass; + private Class outputFormatClass; @Override protected void parseAndValidateParameters() throws IOException { - setStudyId(-1); super.parseAndValidateParameters(); - outputFormat = VariantOutputFormat.valueOf(getParam(OUTPUT_FORMAT_PARAM, "avro").toUpperCase()); - String outdirStr = getParam(OUTPUT_PARAM); - if (StringUtils.isEmpty(outdirStr)) { - throw new IllegalArgumentException("Missing argument " + OUTPUT_PARAM); - } - useReduceStep = Boolean.valueOf(getParam(CONCAT_OUTPUT_PARAM)); - outdir = new Path(outdirStr); - if (isLocal(outdir)) { - localOutput = getLocalOutput(outdir); - outdir = getTempOutdir("opencga_export", localOutput.getName()); - outdir.getFileSystem(getConf()).deleteOnExit(outdir); - } - if (localOutput != null) { - useReduceStep = true; - logger.info(" * Outdir file: " + localOutput.toUri()); - logger.info(" * Temporary outdir file: " + outdir.toUri()); - } else { - logger.info(" * Outdir file: " + outdir.toUri()); - } + outputFormat = VariantWriterFactory.VariantOutputFormat.valueOf(getParam(OUTPUT_FORMAT_PARAM, "avro").toUpperCase()); + } - getQueryFromConfig(query, getConf()); - getQueryOptionsFromConfig(options, getConf()); + @Override + protected Class getMapperClass() { + return mapperClass; + } - logger.info(" * Query:"); - for (Map.Entry entry : query.entrySet()) { - logger.info(" * " + entry.getKey() + " : " + entry.getValue()); - } + @Override + protected Class getReducerClass() { + return reducerClass; } @Override - protected Job setupJob(Job job, String archiveTable, String variantTable) throws IOException { - Class mapperClass; - Class reducerClass; + protected Class getOutputFormatClass() { + return outputFormatClass; + } + + @Override + protected void setupJob(Job job) throws IOException { job.getConfiguration().setBoolean(JobContext.MAP_OUTPUT_COMPRESS, true); job.getConfiguration().setClass(JobContext.MAP_OUTPUT_COMPRESS_CODEC, DeflateCodec.class, CompressionCodec.class); switch (outputFormat) { @@ -127,7 +74,7 @@ protected Job setupJob(Job job, String archiveTable, String variantTable) throws job.getConfiguration().set(AvroJob.CONF_OUTPUT_CODEC, DataFileConstants.DEFLATE_CODEC); // do not break case 
AVRO: - job.setOutputFormatClass(AvroKeyOutputFormat.class); + outputFormatClass = AvroKeyOutputFormat.class; if (useReduceStep) { job.setMapOutputKeyClass(NullWritable.class); AvroJob.setMapOutputValueSchema(job, VariantAvro.getClassSchema()); @@ -148,7 +95,7 @@ protected Job setupJob(Job job, String archiveTable, String variantTable) throws ParquetOutputFormat.setCompression(job, CompressionCodecName.GZIP); // do not break case PARQUET: - job.setOutputFormatClass(AvroParquetOutputFormat.class); + outputFormatClass = AvroParquetOutputFormat.class; AvroParquetOutputFormat.setSchema(job, VariantAvro.getClassSchema()); if (useReduceStep) { job.setMapOutputKeyClass(NullWritable.class); @@ -176,69 +123,13 @@ protected Job setupJob(Job job, String archiveTable, String variantTable) throws } else if (outputFormat.isSnappy()) { FileOutputFormat.setOutputCompressorClass(job, SnappyCodec.class); // compression } - job.setOutputFormatClass(VariantFileOutputFormat.class); + outputFormatClass = VariantFileOutputFormat.class; + job.getConfiguration().set(VariantFileOutputFormat.VARIANT_OUTPUT_FORMAT, outputFormat.name()); job.setOutputKeyClass(Variant.class); break; } - - if (useReduceStep) { - logger.info("Use one Reduce task to produce a single file"); - job.setReducerClass(reducerClass); - job.setNumReduceTasks(1); - } else { - VariantMapReduceUtil.setNoneReduce(job); - } - - VariantQueryParser variantQueryParser = new HadoopVariantQueryParser(null, getMetadataManager()); - ParsedVariantQuery variantQuery = variantQueryParser.parseQuery(query, options); - Query query = variantQuery.getQuery(); - if (VariantHBaseQueryParser.isSupportedQuery(query)) { - logger.info("Init MapReduce job reading from HBase"); - boolean useSampleIndex = !getConf().getBoolean("skipSampleIndex", false) && SampleIndexQueryParser.validSampleIndexQuery(query); - if (useSampleIndex) { - // Remove extra fields from the query - new SampleIndexDBAdaptor(getHBaseManager(), getTableNameGenerator(), getMetadataManager()).parseSampleIndexQuery(query); - - logger.info("Use sample index to read from HBase"); - } - - VariantHBaseQueryParser parser = new VariantHBaseQueryParser(getMetadataManager()); - List scans = parser.parseQueryMultiRegion(variantQuery, options); - VariantMapReduceUtil.configureMapReduceScans(scans, getConf()); - - VariantMapReduceUtil.initVariantMapperJobFromHBase(job, variantTable, scans, mapperClass, useSampleIndex); - } else { - logger.info("Init MapReduce job reading from Phoenix"); - String sql = new VariantSqlQueryParser(variantTable, getMetadataManager(), getHelper().getConf()) - .parse(variantQuery, options); - - VariantMapReduceUtil.initVariantMapperJobFromPhoenix(job, variantTable, sql, mapperClass); - } - - setNoneTimestamp(job); - - FileOutputFormat.setOutputPath(job, outdir); // set Path - - VariantMapReduceUtil.configureVariantConverter(job.getConfiguration(), false, true, true, - query.getString(VariantQueryParam.UNKNOWN_GENOTYPE.key(), "./.")); - - job.getConfiguration().set(VariantFileOutputFormat.VARIANT_OUTPUT_FORMAT, outputFormat.name()); - - return job; } - @Override - protected void postExecution(boolean succeed) throws IOException, StorageEngineException { - super.postExecution(succeed); - if (succeed) { - if (localOutput != null) { - concatMrOutputToLocal(outdir, localOutput); - } - } - if (localOutput != null) { - deleteTemporaryFile(outdir); - } - } @Override protected String getJobOperationName() { @@ -247,9 +138,9 @@ protected String getJobOperationName() { /** * Mapper to convert to 
Variant.
-     * The output of this mapper should be connected directly to the {@link VariantOutputFormat}
+     * The output of this mapper should be connected directly to the {@link VariantWriterFactory.VariantOutputFormat}
      * This mapper can not work with a reduce step.
-     * @see VariantOutputFormat
+     * @see VariantWriterFactory.VariantOutputFormat
      */
     public static class VariantExporterDirectMapper extends VariantMapper {
         @Override
@@ -308,9 +199,9 @@ protected void map(Object key, Variant value, Context context) throws IOExceptio
     /**
      * Reducer to join all VariantAvro and generate Variants.
-     * The output of this reducer should be connected to the {@link VariantOutputFormat}
+     * The output of this reducer should be connected to the {@link VariantWriterFactory.VariantOutputFormat}
      * @see AvroVariantExporterMapper
-     * @see VariantOutputFormat
+     * @see VariantWriterFactory.VariantOutputFormat
      */
     public static class VariantExporterReducer extends Reducer<NullWritable, AvroValue<VariantAvro>, Variant, NullWritable> {
         @Override
diff --git a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/mr/StreamVariantDriver.java b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/mr/StreamVariantDriver.java
new file mode 100644
index 00000000000..bb31552ad64
--- /dev/null
+++ b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/mr/StreamVariantDriver.java
@@ -0,0 +1,140 @@
+package org.opencb.opencga.storage.hadoop.variant.mr;
+
+import org.apache.commons.lang3.StringUtils;
+import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.io.compress.CompressionCodec;
+import org.apache.hadoop.io.compress.DeflateCodec;
+import org.apache.hadoop.io.compress.GzipCodec;
+import org.apache.hadoop.mapred.JobContext;
+import org.apache.hadoop.mapreduce.Job;
+import org.apache.hadoop.mapreduce.Reducer;
+import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
+import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
+import org.apache.hadoop.util.Tool;
+import org.opencb.opencga.storage.core.variant.io.VariantWriterFactory;
+import org.opencb.opencga.storage.hadoop.utils.ValueOnlyTextOutputFormat;
+import org.opencb.opencga.storage.hadoop.variant.io.VariantDriver;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.IOException;
+import java.lang.invoke.MethodHandles;
+import java.util.Map;
+
+public class StreamVariantDriver extends VariantDriver {
+
+    public static final String INPUT_FORMAT_PARAM = "inputFormat";
+    public static final String COMMAND_LINE_PARAM = "commandLine";
+    public static final String COMMAND_LINE_BASE64_PARAM = "commandLineBase64";
+    public static final String MAX_BYTES_PER_MAP_PARAM = "maxBytesPerMap";
+
+    private VariantWriterFactory.VariantOutputFormat format;
+    private int maxBytesPerMap;
+    private static Logger logger = LoggerFactory.getLogger(StreamVariantDriver.class);
+    private String commandLine;
+
+    private Class mapperClass;
+    private Class reducerClass;
+    private Class outputFormatClass;
+
+    @Override
+    protected Map getParams() {
+        Map params = super.getParams();
+        params.put(INPUT_FORMAT_PARAM, "");
+        params.put(COMMAND_LINE_PARAM, "");
+        params.put(COMMAND_LINE_BASE64_PARAM, "");
+
+        return params;
+    }
+
+    @Override
+    protected void parseAndValidateParameters() throws IOException {
+        super.parseAndValidateParameters();
+
+        String inputFormat = getParam(INPUT_FORMAT_PARAM);
+        if (inputFormat == null) {
+            throw new IllegalArgumentException("Missing input format!");
+        }
+        format = VariantWriterFactory.toOutputFormat(inputFormat, "");
+        if (format == null) {
+            throw new IllegalArgumentException("Unknown input format " + inputFormat);
+        }
+        maxBytesPerMap = Integer.parseInt(getParam(MAX_BYTES_PER_MAP_PARAM, String.valueOf(1024 * 1024 * 1024)));
+
+        commandLine = getParam(COMMAND_LINE_PARAM);
+        String commandLineBase64 = getParam(COMMAND_LINE_BASE64_PARAM);
+        if (commandLine == null && commandLineBase64 == null) {
+            throw new IllegalArgumentException("Missing command line!");
+        }
+        if (commandLine != null && commandLineBase64 != null) {
+            throw new IllegalArgumentException("Only one of '" + COMMAND_LINE_PARAM + "' or '" + COMMAND_LINE_BASE64_PARAM + "'"
+                    + " is allowed!");
+        }
+
+        if (commandLineBase64 != null) {
+            commandLine = new String(java.util.Base64.getDecoder().decode(commandLineBase64));
+        }
+
+        String outdirStr = getParam(OUTPUT_PARAM);
+        if (StringUtils.isEmpty(outdirStr)) {
+            throw new IllegalArgumentException("Missing argument " + OUTPUT_PARAM);
+        }
+    }
+
+    @Override
+    protected Class getMapperClass() {
+        return mapperClass;
+    }
+
+    @Override
+    protected Class getReducerClass() {
+        return reducerClass;
+    }
+
+    @Override
+    protected Class getOutputFormatClass() {
+        return outputFormatClass;
+    }
+
+    @Override
+    protected void setupJob(Job job) throws IOException {
+
+        job.getConfiguration().setBoolean(JobContext.MAP_OUTPUT_COMPRESS, true);
+        job.getConfiguration().setClass(JobContext.MAP_OUTPUT_COMPRESS_CODEC, DeflateCodec.class, CompressionCodec.class);
+
+        Class keyClass = ImmutableBytesWritable.class;
+//        Class keyClass = NullWritable.class;
+//        Class keyClass = Text.class;
+        Class valueClass = Text.class;
+
+        mapperClass = StreamVariantMapper.class;
+        job.setMapOutputKeyClass(keyClass);
+        job.setMapOutputValueClass(valueClass);
+
+        StreamVariantMapper.setCommandLine(job, commandLine);
+        StreamVariantMapper.setVariantFormat(job, format);
+        StreamVariantMapper.setMaxInputBytesPerProcess(job, maxBytesPerMap);
+
+        reducerClass = Reducer.class;
+
+        outputFormatClass = ValueOnlyTextOutputFormat.class;
+        job.setOutputFormatClass(ValueOnlyTextOutputFormat.class);
+        TextOutputFormat.setCompressOutput(job, true);
+        TextOutputFormat.setOutputCompressorClass(job, GzipCodec.class);
+//        TextOutputFormat.setOutputCompressorClass(job, DeflateCodec.class);
+        job.setOutputKeyClass(keyClass);
+        job.setOutputValueClass(valueClass);
+    }
+
+    @Override
+    protected String getJobOperationName() {
+        return "stream-variants";
+    }
+
+    @SuppressWarnings("unchecked")
+    public static void main(String[] args) {
+        main(args, (Class) MethodHandles.lookup().lookupClass());
+    }
+
+}
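
The command line launched by this driver follows a small stdin/stdout/stderr contract, implemented by StreamVariantMapper below: variants are piped into the process stdin in the configured format, every line the process prints to stdout becomes one output record, and stderr lines starting with "reporter:" update Hadoop counters or the task status (see MRErrorThread: "reporter:counter:<name>,<amount>" and "reporter:status:<message>"). A minimal, self-contained Python sketch of such a walker, assuming plain VCF on stdin and a purely illustrative per-variant transformation (this script is not part of the patch):

    #!/usr/bin/env python3
    # Minimal stream walker: VCF lines in on stdin, one record per line out on
    # stdout, and "reporter:" lines on stderr for Hadoop counters and status.
    import sys

    variants = 0
    for line in sys.stdin:
        if line.startswith('#'):
            continue  # skip VCF header lines
        variants += 1
        chrom, pos, _id, ref, alt = line.rstrip('\n').split('\t')[:5]
        sys.stdout.write('{}:{}:{}:{}\n'.format(chrom, pos, ref, alt))
        if variants % 1000 == 0:
            sys.stderr.write('reporter:counter:variants_walked,1000\n')
            sys.stderr.write('reporter:status:walked {} variants\n'.format(variants))

    sys.stderr.write('reporter:counter:variants_walked,{}\n'.format(variants % 1000))

Because the mapper re-keys every stdout line with an auto-incremented binary key and the job, when concatenating output, runs a single identity Reducer, the records emitted by the process keep their order in the final file.
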
diff --git a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/mr/StreamVariantMapper.java b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/mr/StreamVariantMapper.java
new file mode 100644
index 00000000000..163d6bd9646
--- /dev/null
+++ b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/mr/StreamVariantMapper.java
@@ -0,0 +1,394 @@
+package org.opencb.opencga.storage.hadoop.variant.mr;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import 
org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.hbase.io.ImmutableBytesWritable; +import org.apache.hadoop.hbase.util.Bytes; +import org.apache.hadoop.io.Text; +import org.apache.hadoop.mapreduce.Job; +import org.apache.hadoop.mapreduce.Mapper; +import org.apache.hadoop.util.LineReader; +import org.opencb.biodata.models.variant.Variant; +import org.opencb.commons.datastore.core.Query; +import org.opencb.commons.datastore.core.QueryOptions; +import org.opencb.commons.io.DataWriter; +import org.opencb.opencga.storage.core.metadata.VariantStorageMetadataManager; +import org.opencb.opencga.storage.core.variant.io.VariantWriterFactory; +import org.opencb.opencga.storage.hadoop.variant.metadata.HBaseVariantStorageMetadataDBAdaptorFactory; + +import java.io.*; +import java.util.Base64; +import java.util.HashMap; +import java.util.Map; +import java.util.concurrent.atomic.AtomicReference; + +import static org.opencb.opencga.storage.hadoop.variant.mr.VariantsTableMapReduceHelper.COUNTER_GROUP_NAME; + +public class StreamVariantMapper extends VariantMapper { + private static final Log LOG = LogFactory.getLog(StreamVariantMapper.class); + + private static final int BUFFER_SIZE = 128 * 1024; + public static final String MAX_INPUT_BYTES_PER_PROCESS = "stream.maxInputBytesPerProcess"; + public static final String VARIANT_FORMAT = "stream.variant.format"; + public static final String STREAMPROCESSOR = "stream.map.streamprocessor"; + + private final boolean verboseStdout = false; + private static final long REPORTER_OUT_DELAY = 10 * 1000L; + private static final long REPORTER_ERR_DELAY = 10 * 1000L; + + // Configured at SETUP + private String commandLine; + private int maxInputBytesPerProcess; + private VariantWriterFactory.VariantOutputFormat format; + private Map envs; + private VariantStorageMetadataManager metadataManager; + private VariantWriterFactory writerFactory; + private Query query; + private QueryOptions options; + // Keep an auto-incremental number for each produced record. This is used as the key for the output record, + // and will ensure a sorted output. 
+    private int outputKeyNum;
+
+    // Configured for every new process
+    private Process process;
+    private DataOutputStream stdin;
+    private DataInputStream stdout;
+    private DataInputStream stderr;
+    private MRErrorThread stderrThread;
+    private MROutputThread stdoutThread;
+    private DataWriter<Variant> variantDataWriter;
+    private int processedBytes = 0;
+    private long numRecordsRead = 0;
+    private long numRecordsWritten = 0;
+    protected final AtomicReference<Throwable> throwable = new AtomicReference<>();
+
+    private volatile boolean processProvidedStatus_ = false;
+
+    public static void setCommandLine(Job job, String commandLine) {
+        String commandLineBase64 = Base64.getEncoder().encodeToString(commandLine.getBytes());
+        job.getConfiguration().set(STREAMPROCESSOR, commandLineBase64);
+    }
+
+    public static void setVariantFormat(Job job, VariantWriterFactory.VariantOutputFormat format) {
+        job.getConfiguration().set(VARIANT_FORMAT, format.toString());
+    }
+
+    public static void setMaxInputBytesPerProcess(Job job, int maxInputBytesPerProcess) {
+        job.getConfiguration().setInt(MAX_INPUT_BYTES_PER_PROCESS, maxInputBytesPerProcess);
+    }
+
+    @Override
+    protected void setup(Context context) throws IOException, InterruptedException {
+        super.setup(context);
+        Configuration conf = context.getConfiguration();
+        commandLine = new String(Base64.getDecoder().decode(conf.get(STREAMPROCESSOR)));
+        maxInputBytesPerProcess = conf.getInt(MAX_INPUT_BYTES_PER_PROCESS, 1024 * 1024 * 1024);
+        format = VariantWriterFactory.toOutputFormat(conf.get(VARIANT_FORMAT), "");
+        if (!format.isPlain()) {
+            // The process consumes an uncompressed stream; compression, if any, is applied on the job output
+            format = format.inPlain();
+        }
+
+        envs = new HashMap<>();
+        addEnvironment(envs, conf.get("stream.addenvironment"));
+        // add TMPDIR environment variable with the value of java.io.tmpdir
+        envs.put("TMPDIR", System.getProperty("java.io.tmpdir"));
+
+        VariantTableHelper helper = new VariantTableHelper(conf);
+        metadataManager = new VariantStorageMetadataManager(new HBaseVariantStorageMetadataDBAdaptorFactory(helper));
+        writerFactory = new VariantWriterFactory(metadataManager);
+        query = VariantMapReduceUtil.getQueryFromConfig(conf);
+        options = VariantMapReduceUtil.getQueryOptionsFromConfig(conf);
+        outputKeyNum = context.getCurrentKey().hashCode();
+    }
+
+    @Override
+    public void run(Context context) throws IOException, InterruptedException {
+        if (context.nextKeyValue()) {
+            try {
+                setup(context);
+                startProcess(context);
+                // Do-while instead of "while", as we've already called context.nextKeyValue() once
+                do {
+                    if (processedBytes > maxInputBytesPerProcess) {
+                        LOG.info("Processed bytes = " + processedBytes + " > " + maxInputBytesPerProcess + ". 
Restarting process.");
+                        context.getCounter(COUNTER_GROUP_NAME, "RESTARTED_PROCESS").increment(1);
+                        closeProcess(context);
+                        startProcess(context);
+                    }
+                    map(context.getCurrentKey(), context.getCurrentValue(), context);
+                } while (!hasExceptions() && context.nextKeyValue());
+            } catch (Throwable th) {
+                setException(th);
+            }
+            try {
+                // Always call cleanup, even if there was an exception
+                cleanup(context);
+            } catch (Throwable th) {
+                setException(th);
+            }
+        } else {
+            context.getCounter(COUNTER_GROUP_NAME, "EMPTY_INPUT_SPLIT").increment(1);
+        }
+        throwExceptionIfAny();
+    }
+
+    private boolean hasExceptions() {
+        return throwable.get() != null;
+    }
+
+    private void setException(Throwable th) {
+        if (!throwable.compareAndSet(null, th)) {
+            synchronized (throwable) {
+                // addSuppressed is not thread safe
+                throwable.get().addSuppressed(th);
+            }
+        }
+        // commons-logging has no "{}" placeholders, so log a message and pass the throwable explicitly
+        LOG.warn("Exception in stream process", th);
+    }
+
+    private void throwExceptionIfAny() throws IOException {
+        if (hasExceptions()) {
+            Throwable cause = throwable.get();
+            throwable.set(null);
+            throw new IOException("MROutput/MRErrThread failed:", cause);
+        }
+    }
+
+    @Override
+    protected void cleanup(Mapper.Context context) throws IOException, InterruptedException {
+        closeProcess(context);
+        super.cleanup(context);
+    }
+
+    @Override
+    protected void map(Object key, Variant value, Context context) throws IOException, InterruptedException {
+        numRecordsRead++;
+        variantDataWriter.write(value);
+        stdin.flush();
+        processedBytes = stdin.size();
+    }
+
+    private void closeProcess(Context context) throws IOException, InterruptedException {
+
+        try {
+            if (variantDataWriter != null) {
+                variantDataWriter.post();
+                variantDataWriter.close();
+            }
+
+            // Close stdin to the process. This will cause the process to finish.
+            if (stdin != null) {
+                stdin.close();
+                stdin = null;
+            }
+
+            if (process != null) {
+                // Wait for the process to finish
+                int exitVal = process.waitFor();
+
+                if (exitVal != 0) {
+                    LOG.error("Process exited with code " + exitVal);
+                    throw new IOException("Process exited with code " + exitVal);
+                }
+                process = null;
+            }
+        } catch (Throwable th) {
+            setException(th);
+        }
+
+        try {
+            if (stdout != null) {
+                stdoutThread.join();
+                stdout.close();
+                stdout = null;
+            }
+        } catch (Throwable th) {
+            setException(th);
+        }
+        try {
+            if (stderr != null) {
+                stderrThread.join();
+                stderr.close();
+                stderr = null;
+            }
+        } catch (Throwable th) {
+            setException(th);
+        }
+//        drainStdout(context);
+    }
+
+    private void startProcess(Context context) throws IOException {
+        LOG.info("bash -ce '" + commandLine + "'");
+        context.getCounter(COUNTER_GROUP_NAME, "START_PROCESS").increment(1);
+
+        // Start the process
+        ProcessBuilder builder = new ProcessBuilder("bash", "-ce", commandLine);
+        builder.environment().putAll(envs);
+        process = builder.start();
+
+        stdin = new DataOutputStream(new BufferedOutputStream(
+                process.getOutputStream(),
+                BUFFER_SIZE));
+        stdout = new DataInputStream(new BufferedInputStream(
+                process.getInputStream(),
+                BUFFER_SIZE));
+
+        stderr = new DataInputStream(new BufferedInputStream(process.getErrorStream()));
+
+        stderrThread = new MRErrorThread(context);
+        stdoutThread = new MROutputThread(context);
+        stderrThread.start();
+        stdoutThread.start();
+
+        variantDataWriter = writerFactory.newDataWriter(format, stdin, new Query(query), new QueryOptions(options));
+
+        processedBytes = 0;
+        numRecordsRead = 0;
+        numRecordsWritten = 0;
+        throwable.set(null);
+
+        variantDataWriter.open();
+        variantDataWriter.pre();
+        stdin.flush();
+    }
+
+    void 
+    void addEnvironment(Map<String, String> env, String nameVals) {
+        // encoding "a=b c=d" from StreamJob
+        if (nameVals == null) {
+            return;
+        }
+        String[] nv = nameVals.split(" ");
+        for (int i = 0; i < nv.length; i++) {
+            String[] pair = nv[i].split("=", 2);
+            if (pair.length != 2) {
+                LOG.info("Skip env entry:" + nv[i]);
+            } else {
+                env.put(pair[0], pair[1]);
+            }
+        }
+    }
+
+    private class MROutputThread extends Thread {
+
+        private final Mapper.Context context;
+        private long lastStdoutReport = 0;
+
+        MROutputThread(Context context) {
+            this.context = context;
+            setDaemon(true);
+        }
+
+        public void run() {
+            Text line = new Text();
+            LineReader stdoutLineReader = new LineReader(stdout);
+            try {
+                while (stdoutLineReader.readLine(line) > 0) {
+                    context.write(new ImmutableBytesWritable(Bytes.toBytes(outputKeyNum++)), line);
+//                    context.write(null, line);
+                    if (verboseStdout) {
+                        LOG.info("[STDOUT] - " + line);
+                    }
+                    numRecordsWritten++;
+                    long now = System.currentTimeMillis();
+                    if (now - lastStdoutReport > REPORTER_OUT_DELAY) {
+                        lastStdoutReport = now;
+                        String hline = "Records R/W=" + numRecordsRead + "/" + numRecordsWritten;
+                        if (!processProvidedStatus_) {
+                            context.setStatus(hline);
+                        } else {
+                            context.progress();
+                        }
+                        LOG.info(hline);
+                    }
+                }
+            } catch (Throwable th) {
+                setException(th);
+            }
+        }
+    }
+
+    private class MRErrorThread extends Thread {
+
+        private final Configuration conf;
+        private final Mapper.Context context;
+        private long lastStderrReport = 0;
+        private final String reporterPrefix;
+        private final String counterPrefix;
+        private final String statusPrefix;
+
+        MRErrorThread(Context context) {
+            this.context = context;
+            this.conf = context.getConfiguration();
+            this.reporterPrefix = conf.get("stream.stderr.reporter.prefix", "reporter:");
+            this.counterPrefix = reporterPrefix + "counter:";
+            this.statusPrefix = reporterPrefix + "status:";
+            setDaemon(true);
+        }
+
+        public void run() {
+            Text line = new Text();
+            LineReader stderrLineReader = new LineReader(stderr);
+            try {
+                while (stderrLineReader.readLine(line) > 0) {
+                    String lineStr = line.toString();
+                    if (matchesReporter(lineStr)) {
+                        if (matchesCounter(lineStr)) {
+                            incrCounter(lineStr);
+                        } else if (matchesStatus(lineStr)) {
+                            processProvidedStatus_ = true;
+                            setStatus(lineStr);
+                        } else {
+                            LOG.warn("Cannot parse reporter line: " + lineStr);
+                        }
+                    } else {
+                        LOG.info("[STDERR] - " + lineStr);
+//                        System.err.println(lineStr);
+                    }
+                    long now = System.currentTimeMillis();
+                    if (now - lastStderrReport > REPORTER_ERR_DELAY) {
+                        lastStderrReport = now;
+                        context.progress();
+                    }
+                    line.clear();
+                }
+            } catch (Throwable th) {
+                setException(th);
+            }
+        }
+
+        private boolean matchesReporter(String line) {
+            return line.startsWith(reporterPrefix);
+        }
+
+        private boolean matchesCounter(String line) {
+            return line.startsWith(counterPrefix);
+        }
+
+        private boolean matchesStatus(String line) {
+            return line.startsWith(statusPrefix);
+        }
+
+        private void incrCounter(String line) {
+            String trimmedLine = line.substring(counterPrefix.length()).trim();
+            String[] columns = trimmedLine.split(",");
+            if (columns.length == 2) {
+                try {
+                    context.getCounter(COUNTER_GROUP_NAME, columns[0]).increment(Long.parseLong(columns[1]));
+                } catch (NumberFormatException e) {
+                    LOG.warn("Cannot parse counter increment '" + columns[1] + "' from line: " + line);
+                }
+            } else {
+                LOG.warn("Cannot parse counter line: " + line);
+            }
+        }
+
+        private void setStatus(String line) {
+            context.setStatus(line.substring(statusPrefix.length()).trim());
+        }
+    }
+
+}
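Taken together, the mapper above is a pipe-driven wrapper around a child process: variants are serialized to the child's stdin, stdout and stderr are drained on daemon threads, the child is restarted once a byte budget is exceeded, and a non-zero exit code fails the task. The following standalone Python sketch mirrors that lifecycle for readers who want to reason about it outside Hadoop; it is illustrative only, and the StreamRunner name and restart_after_bytes default are assumptions, not part of this patch:

import subprocess
import sys
import threading

class StreamRunner:
    """Minimal stand-in for StreamVariantMapper's process handling (sketch only)."""

    def __init__(self, command_line, restart_after_bytes=1024 * 1024 * 1024):
        self.command_line = command_line
        self.restart_after_bytes = restart_after_bytes
        self.process = None
        self.written = 0

    def _drain(self, stream, prefix):
        # Mirror of MROutputThread/MRErrorThread: consume the child's output on a
        # daemon thread so the pipes never fill up and block the child.
        for raw in iter(stream.readline, b''):
            print(prefix, raw.decode(errors='replace').rstrip(), file=sys.stderr)

    def _start(self):
        # Same invocation shape as startProcess(): "bash -ce <commandLine>".
        self.process = subprocess.Popen(['bash', '-ce', self.command_line],
                                        stdin=subprocess.PIPE,
                                        stdout=subprocess.PIPE,
                                        stderr=subprocess.PIPE)
        for stream, prefix in ((self.process.stdout, '[STDOUT]'), (self.process.stderr, '[STDERR]')):
            threading.Thread(target=self._drain, args=(stream, prefix), daemon=True).start()
        self.written = 0

    def _stop(self):
        # Mirror of closeProcess(): closing stdin lets the child finish;
        # a non-zero exit code is an error, as in the mapper.
        self.process.stdin.close()
        exit_code = self.process.wait()
        if exit_code != 0:
            raise RuntimeError('Process exited with code %d' % exit_code)

    def run(self, lines):
        self._start()
        for line in lines:
            if self.written > self.restart_after_bytes:
                self._stop()      # mirrors the RESTARTED_PROCESS counter path
                self._start()
            data = (line + '\n').encode()
            self.process.stdin.write(data)
            self.written += len(data)
        self._stop()

if __name__ == '__main__':
    StreamRunner('while read -r l; do echo "got: $l"; done').run(['a', 'b', 'c'])

The Java implementation additionally intercepts "reporter:" lines on stderr and turns them into Hadoop counters and status updates, which the sketch deliberately omits.
diff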
--git a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/python/variant_walker.py b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/python/variant_walker.py
new file mode 100644
index 00000000000..4d56e92c45c
--- /dev/null
+++ b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/python/variant_walker.py
@@ -0,0 +1,157 @@
+import sys
+import importlib
+import os
+from abc import ABC, abstractmethod
+
+class VariantWalker(ABC):
+    @abstractmethod
+    def setup(self, *args):
+        """
+        This function is responsible for setting up any necessary configurations
+        before processing the entries.
+        *args: Configuration arguments.
+        """
+        pass
+
+    @abstractmethod
+    def header(self, header):
+        """
+        This function will process the header as a list of strings.
+        header (list): A list of strings representing the header.
+        """
+        pass
+
+    @abstractmethod
+    def map(self, line):
+        """
+        This function processes each entry.
+
+        Args:
+            line (str): A line read from stdin.
+        """
+        pass
+
+    @abstractmethod
+    def cleanup(self):
+        """
+        This function is responsible for any cleanup tasks after all entries have been processed.
+        """
+        pass
+
+    def count(self, key, increment):
+        """
+        Increment a counter with a given value.
+
+        Args:
+            key (str): Counter name
+            increment (int): Counter increment
+        """
+        if not all(char.isalnum() or char in ['_', '-'] for char in key):
+            raise ValueError("Invalid key. Key can only contain alphanumeric characters, underscores, and hyphens.")
+
+        print(f"reporter:counter:{key},{increment}", file=sys.stderr)
+
+    def write(self, value):
+        """
+        Write a value to stdout.
+
+        Args:
+            value (str): The value to write.
+        """
+        print(value)
+
+    def jsonHeaderToVcfHeader(self, jsonHeader):
+        """
+        Convert a JSON header to a VCF header.
+
+        Args:
+            jsonHeader (dict): The JSON header to convert.
+        """
+        # TODO: Implement this method
+        return ""
+
+    def getTmpdir(self):
+        """
+        Get the temporary directory.
+
+        Returns:
+            str: The temporary directory, taken from the TMPDIR environment variable.
+        """
+        return os.environ.get("TMPDIR", "/tmp")
+
+
+def main(module_name, class_name, *args):
+    """
+    This is the main function that sets up the environment, reads lines from stdin,
+    processes them using the map function, and performs cleanup tasks.
+
+    Args:
+        module_name (str): The name of the module where the VariantWalker subclass is defined.
+        class_name (str): The name of the VariantWalker subclass to use.
+        *args: Additional arguments to pass to the setup method of the VariantWalker subclass.
+ """ + ## If the modulename is a fileName, use the source file loader to load the module + if module_name.endswith(".py"): + ## If the modulename is a relative path, we need to make it an absolute path prepending the current working dir + if not module_name.startswith("/"): + module_name = f"{os.getcwd()}/{module_name}" + + loader = importlib.machinery.SourceFileLoader( 'walker_module', module_name ) + spec = importlib.util.spec_from_loader( 'walker_module', loader ) + module = importlib.util.module_from_spec( spec ) + loader.exec_module( module ) + else: + module = importlib.import_module(module_name) + + WalkerClass = getattr(module, class_name) + walker = WalkerClass() + + try: + walker.setup(*args) + except Exception as e: + print(f"An error occurred during setup: {e}", file=sys.stderr) + raise + + num_entries = 0 + size_entries = 0 + + header_read = False + header = [] + for line in sys.stdin: + num_entries = num_entries + 1 + size_entries = size_entries + len(line) + # Now 'line' does not have trailing '\n' or '\r' + line = line.rstrip() + + ## The line will be a header line if it starts with '#' or if it's the first line + if not header_read: + if line.startswith("#") or num_entries == 1: + header.append(line) + continue + else: + header_read = True + walker.header(header) + + try: + walker.map(line) + except Exception as e: + print(f"An error occurred while processing the line: {e}", file=sys.stderr) + raise + + walker.count("num_entries", num_entries) + walker.count("size_entries", size_entries) + try: + walker.cleanup() + except Exception as e: + print(f"An error occurred during cleanup: {e}", file=sys.stderr) + raise + return 0 + +if __name__ == "__main__": + if len(sys.argv) < 3: + print("Usage: python variant_walker.py [args...]", file=sys.stderr) + sys.exit(1) + sys.exit(main(sys.argv[1], sys.argv[2], *sys.argv[3:])) \ No newline at end of file diff --git a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/python/walker_example.py b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/python/walker_example.py new file mode 100644 index 00000000000..2c5c92fd6ae --- /dev/null +++ b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/python/walker_example.py @@ -0,0 +1,51 @@ +import argparse +from variant_walker import VariantWalker + +class Echo(VariantWalker): + def setup(self, *arg): + pass + + def header(self, header): + self.write(header) + + def map(self, line): + self.write(line) + pass + + def cleanup(self): + pass + +class Cut(VariantWalker): + def setup(self, *args): + parser = argparse.ArgumentParser() + parser.add_argument('--length', default=10, help='The length to trim each line to.') + args = parser.parse_args(args) + self.length = int(args.length) + + def header(self, header): + # Print last line from header + self.write(header[-1]) + pass + + def map(self, line): + self.write(line[:self.length]) + + def cleanup(self): + pass + +class Simplify(VariantWalker): + def setup(self, *args): + pass + + def header(self, header): + # Print last line from header + self.write(header[-1]) + + def map(self, line): + # Split line by tab + fields = line.split('\t') + # Write fields 0, 1, 3, 4 joined by ':' + self.write(':'.join([fields[0], fields[1], fields[3], fields[4]])) + + def cleanup(self): + pass diff --git a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/test/java/org/opencb/opencga/storage/hadoop/variant/VariantHadoopStoragePipelineTest.java 
b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/test/java/org/opencb/opencga/storage/hadoop/variant/VariantHadoopStoragePipelineTest.java index 773f017a67c..33b67fb5b12 100644 --- a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/test/java/org/opencb/opencga/storage/hadoop/variant/VariantHadoopStoragePipelineTest.java +++ b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/test/java/org/opencb/opencga/storage/hadoop/variant/VariantHadoopStoragePipelineTest.java @@ -40,6 +40,7 @@ import org.opencb.opencga.storage.core.variant.VariantStorageOptions; import org.opencb.opencga.storage.core.variant.adaptors.VariantQueryParam; import org.opencb.opencga.storage.core.variant.adaptors.iterators.VariantDBIterator; +import org.opencb.opencga.storage.core.variant.io.VariantWriterFactory; import org.opencb.opencga.storage.hadoop.utils.HBaseManager; import org.opencb.opencga.storage.hadoop.variant.adaptors.VariantHadoopDBAdaptor; import org.opencb.opencga.storage.hadoop.variant.adaptors.phoenix.VariantPhoenixKeyFactory; @@ -284,4 +285,52 @@ public void printVariants() throws Exception { VariantHbaseTestUtils.printVariants(studyMetadata, dbAdaptor, outDir); } + + @Test + public void exportCommand() throws Exception { + URI outdir = newOutputUri(); + List cmdList = Arrays.asList( + "export NUM_VARIANTS=0 ;", + "function setup() {", + " echo \"#SETUP\" ;", + " echo '## Something in single quotes' ; ", + "} ;", + "function map() {", +// " echo \"[$NUM_VARIANTS] $1\" 1>&2 ;", + " echo \"[$NUM_VARIANTS] \" 1>&2 ;", + " echo \"$1\" | jq .id ;", + " NUM_VARIANTS=$((NUM_VARIANTS+1)) ;", + "};", + "function cleanup() {", + " echo \"CLEANUP\" ;", + " echo \"NumVariants = $NUM_VARIANTS\" ;", + "};", + "setup;", + "while read -r i ; do ", + " map \"$i\" ; ", + "done; ", + "cleanup;"); + + // TODO: Add docker prune + + // String cmd = "bash -c '" + String.join("\n", cmdList) + "'"; + String cmd = String.join("\n", cmdList); + String cmdBash = "bash -ce '" + cmd.replace("'", "'\"'\"'") + "'"; + String cmdDocker = "docker run --rm -i opencb/opencga-base bash -ce '" + cmd.replace("'", "'\"'\"'") + "'"; + String cmdPython1 = "python variant_walker.py walker_example Cut --length 30"; +// String cmdPython2 = "python /home/jacobo/appl/opencga/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/python/* opencga-storage-hadoop-walker-example MyWalker --length 30"; + + +// variantStorageEngine.walkData(outdir.resolve("variant3.txt.gz"), VariantWriterFactory.VariantOutputFormat.JSON, new Query(), new QueryOptions(), cmdDocker); +// variantStorageEngine.walkData(outdir.resolve("variant2.txt.gz"), VariantWriterFactory.VariantOutputFormat.JSON, new Query(), new QueryOptions(), cmdBash); +// variantStorageEngine.walkData(outdir.resolve("variant1.txt.gz"), VariantWriterFactory.VariantOutputFormat.JSON, new Query(), new QueryOptions(), cmd); +// variantStorageEngine.walkData(outdir.resolve("variant5.txt.gz"), VariantWriterFactory.VariantOutputFormat.JSON, new Query(), new QueryOptions(), cmdPython1); +// variantStorageEngine.walkData(outdir.resolve("variant8.txt.gz"), VariantWriterFactory.VariantOutputFormat.JSON, new Query(), new QueryOptions(), cmdPython2); +// variantStorageEngine.walkData(outdir.resolve("variant6.txt.gz"), VariantWriterFactory.VariantOutputFormat.VCF, new Query(), new QueryOptions(), cmdPython); +// variantStorageEngine.walkData(outdir.resolve("variant4.txt.gz"), VariantWriterFactory.VariantOutputFormat.JSON, new Query(), 
new QueryOptions(), "opencb/opencga-base", cmd); +// variantStorageEngine.walkData(outdir.resolve("variant4.txt.gz"), VariantWriterFactory.VariantOutputFormat.JSON, new Query(), new QueryOptions(), "opencb/opencga-base", cmdPython1); + variantStorageEngine.walkData(outdir.resolve("variant4.txt.gz"), VariantWriterFactory.VariantOutputFormat.JSON, new Query(), new QueryOptions(), "my-python-app:latest", cmdPython1); + + } + } diff --git a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/test/java/org/opencb/opencga/storage/hadoop/variant/io/HadoopVariantExporterTest.java b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/test/java/org/opencb/opencga/storage/hadoop/variant/io/HadoopVariantExporterTest.java index f7f89519477..32f0151d676 100644 --- a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/test/java/org/opencb/opencga/storage/hadoop/variant/io/HadoopVariantExporterTest.java +++ b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/test/java/org/opencb/opencga/storage/hadoop/variant/io/HadoopVariantExporterTest.java @@ -176,7 +176,7 @@ public void exportTped() throws Exception { public void exportJson() throws Exception { String fileName = "variants.json"; URI uri = getOutputUri(fileName); - variantStorageEngine.exportData(uri, VariantWriterFactory.VariantOutputFormat.JSON, null, new Query(STUDY.key(), study1), new QueryOptions()); + variantStorageEngine.exportData(uri, VariantWriterFactory.VariantOutputFormat.JSON, null, new VariantQuery().study(study1).includeSampleAll(), new QueryOptions()); copyToLocal(fileName, uri); } diff --git a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/test/resources/gaps/file1.genome.vcf b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/test/resources/gaps/file1.genome.vcf index ad5044c0fbf..9457d3446f8 100644 --- a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/test/resources/gaps/file1.genome.vcf +++ b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/test/resources/gaps/file1.genome.vcf @@ -2,6 +2,7 @@ ##FORMAT= ##FORMAT= ##INFO= +##contig= #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT s1 1 1 . N . . . END=10003 GT:DP .:. 1 10004 . C . . . END=10010 GT:DP 0/0:3 diff --git a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/test/resources/gaps/file2.genome.vcf b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/test/resources/gaps/file2.genome.vcf index 9796f163be2..f240e02b859 100644 --- a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/test/resources/gaps/file2.genome.vcf +++ b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/test/resources/gaps/file2.genome.vcf @@ -7,6 +7,7 @@ ##MULTI_OVERLAP=1:10013:T:C and 1:10014:A:T with 1:10011:ATTT:A ##INSERTION_GAP=1:10031:T:TAAA does not overlap with any from here ##PARTIAL_REFERENCE_BLOCK=1:10044-10053 does not have DP field. +##contig= #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT s2 1 1 . N . . . END=10003 GT:DP .:. 1 10004 . C . . . END=10012 GT:DP 0/0:3 From 9ea00ebc21068fed1f919e1e975161e65b43c25b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jacobo=20Coll=20Morag=C3=B3n?= Date: Thu, 10 Oct 2024 10:48:31 +0100 Subject: [PATCH 02/66] storage: Add STDERR to exception thrown. Fix max_bytes_per_map. 
#TASK-6722 --- .../analysis/variant/VariantWalkerTool.java | 22 +++-- .../manager/VariantStorageManager.java | 26 ++++-- .../manager/VariantStorageManagerTest.java | 16 ++++ .../core/variant/VariantStorageEngine.java | 19 +++- .../core/variant/VariantStorageOptions.java | 2 +- .../core/variant/io/VariantWriterFactory.java | 2 +- .../main/resources/storage-configuration.yml | 10 +++ .../dummy/DummyVariantStorageEngine.java | 2 +- .../variant/HadoopVariantStorageEngine.java | 39 +++++++-- .../variant/HadoopVariantStorageOptions.java | 17 ++++ .../variant/mr/StreamVariantDriver.java | 18 ++++ .../variant/mr/StreamVariantMapper.java | 87 +++++++++++++------ 12 files changed, 205 insertions(+), 55 deletions(-) diff --git a/opencga-analysis/src/main/java/org/opencb/opencga/analysis/variant/VariantWalkerTool.java b/opencga-analysis/src/main/java/org/opencb/opencga/analysis/variant/VariantWalkerTool.java index 3e826de405b..a3eddd7eefe 100644 --- a/opencga-analysis/src/main/java/org/opencb/opencga/analysis/variant/VariantWalkerTool.java +++ b/opencga-analysis/src/main/java/org/opencb/opencga/analysis/variant/VariantWalkerTool.java @@ -26,7 +26,6 @@ import org.opencb.opencga.core.models.variant.VariantWalkerParams; import org.opencb.opencga.core.tools.annotations.Tool; import org.opencb.opencga.core.tools.annotations.ToolParams; -import org.opencb.opencga.storage.core.variant.adaptors.VariantQueryParam; import org.opencb.opencga.storage.core.variant.io.VariantWriterFactory; import java.net.URI; @@ -55,6 +54,15 @@ protected void check() throws Exception { } format = VariantWriterFactory.toOutputFormat(toolParams.getOutputFileName(), toolParams.getOutputFileName()); + if (!format.isPlain()) { + format = format.inPlain(); + } + + if (StringUtils.isEmpty(toolParams.getOutputFileName())) { + toolParams.setOutputFileName("output." + format.toString().toLowerCase() + ".gz"); + } else if (!toolParams.getOutputFileName().endsWith(".gz")) { + toolParams.setOutputFileName(toolParams.getOutputFileName() + ".gz"); + } } @Override @@ -70,15 +78,11 @@ protected void run() throws Exception { // The scratch directory is expected to be faster than the final directory // This also avoids moving files to final directory if the tool fails Path outDir = getScratchDir(); - String outputFile = StringUtils.isEmpty(toolParams.getOutputFileName()) - ? 
outDir.toString() - : outDir.resolve(toolParams.getOutputFileName()).toString(); + String outputFile = outDir.resolve(toolParams.getOutputFileName()).toString(); Query query = toolParams.toQuery(); - QueryOptions queryOptions = new QueryOptions(params); - for (VariantQueryParam param : VariantQueryParam.values()) { - queryOptions.remove(param.key()); - } - uris.addAll(variantStorageManager.walkData(outputFile, + QueryOptions queryOptions = new QueryOptions().append(QueryOptions.INCLUDE, toolParams.getInclude()) + .append(QueryOptions.EXCLUDE, toolParams.getExclude()); + uris.add(variantStorageManager.walkData(outputFile, format, query, queryOptions, toolParams.getDockerImage(), toolParams.getCommandLine(), token)); }); step("move-files", () -> { diff --git a/opencga-analysis/src/main/java/org/opencb/opencga/analysis/variant/manager/VariantStorageManager.java b/opencga-analysis/src/main/java/org/opencb/opencga/analysis/variant/manager/VariantStorageManager.java index d1e276fbf34..f292e6d6a33 100644 --- a/opencga-analysis/src/main/java/org/opencb/opencga/analysis/variant/manager/VariantStorageManager.java +++ b/opencga-analysis/src/main/java/org/opencb/opencga/analysis/variant/manager/VariantStorageManager.java @@ -88,6 +88,7 @@ import org.opencb.opencga.storage.core.variant.VariantStorageOptions; import org.opencb.opencga.storage.core.variant.adaptors.*; import org.opencb.opencga.storage.core.variant.adaptors.iterators.VariantDBIterator; +import org.opencb.opencga.storage.core.variant.io.VariantWriterFactory; import org.opencb.opencga.storage.core.variant.io.VariantWriterFactory.VariantOutputFormat; import org.opencb.opencga.storage.core.variant.query.ParsedQuery; import org.opencb.opencga.storage.core.variant.query.VariantQueryResult; @@ -98,6 +99,7 @@ import java.io.IOException; import java.net.URI; +import java.net.URISyntaxException; import java.nio.file.Paths; import java.util.*; import java.util.concurrent.TimeUnit; @@ -202,14 +204,19 @@ public List exportData(String outputFile, VariantOutputFormat outputFormat, * @throws StorageEngineException If there is any error exporting variants * @return generated files */ - public List walkData(String outputFile, VariantOutputFormat format, + public URI walkData(String outputFile, VariantOutputFormat format, Query query, QueryOptions queryOptions, String dockerImage, String commandLine, String token) throws CatalogException, StorageEngineException { String anyStudy = catalogUtils.getAnyStudy(query, token); return secureAnalysis(VariantWalkerTool.ID, anyStudy, queryOptions, token, engine -> { Query finalQuery = catalogUtils.parseQuery(query, queryOptions, engine.getCellBaseUtils(), token); checkSamplesPermissions(finalQuery, queryOptions, token); - URI outputUri = new VariantExportOperationManager(this, engine).getOutputUri(outputFile, format, finalQuery, token); + URI outputUri; + try { + outputUri = UriUtils.createUri(outputFile); + } catch (URISyntaxException e) { + throw new IllegalArgumentException(e); + } return engine.walkData(outputUri, format, finalQuery, queryOptions, dockerImage, commandLine); }); } @@ -533,7 +540,7 @@ public boolean hasVariantSetup(String studyStr, String token) throws CatalogExce public ObjectMap configureProject(String projectStr, ObjectMap params, String token) throws CatalogException, StorageEngineException { return secureOperationByProject("configure", projectStr, params, token, engine -> { - validateNewConfiguration(engine, params); + validateNewConfiguration(engine, params, token); DataStore dataStore = 
getDataStoreByProjectId(projectStr, token); @@ -546,7 +553,7 @@ public ObjectMap configureProject(String projectStr, ObjectMap params, String to public ObjectMap configureStudy(String studyStr, ObjectMap params, String token) throws CatalogException, StorageEngineException { return secureOperation("configure", studyStr, params, token, engine -> { - validateNewConfiguration(engine, params); + validateNewConfiguration(engine, params, token); Study study = catalogManager.getStudyManager() .get(studyStr, new QueryOptions(INCLUDE, StudyDBAdaptor.QueryParams.INTERNAL_CONFIGURATION_VARIANT_ENGINE_OPTIONS.key()), @@ -570,12 +577,13 @@ public ObjectMap configureStudy(String studyStr, ObjectMap params, String token) }); } - private void validateNewConfiguration(VariantStorageEngine engine, ObjectMap params) throws StorageEngineException { - for (VariantStorageOptions option : VariantStorageOptions.values()) { - if (option.isProtected() && params.get(option.key()) != null) { - throw new StorageEngineException("Unable to update protected option '" + option.key() + "'"); - } + private void validateNewConfiguration(VariantStorageEngine engine, ObjectMap params, String token) + throws StorageEngineException, CatalogException { + if (catalogManager.getAuthorizationManager().isOpencgaAdministrator(catalogManager.getUserManager().validateToken(token))) { + logger.info("Skip configuration validation. User is an admin."); + return; } + engine.validateNewConfiguration(params); } /** diff --git a/opencga-analysis/src/test/java/org/opencb/opencga/analysis/variant/manager/VariantStorageManagerTest.java b/opencga-analysis/src/test/java/org/opencb/opencga/analysis/variant/manager/VariantStorageManagerTest.java index 4aeedde871f..6f371c7fa82 100644 --- a/opencga-analysis/src/test/java/org/opencb/opencga/analysis/variant/manager/VariantStorageManagerTest.java +++ b/opencga-analysis/src/test/java/org/opencb/opencga/analysis/variant/manager/VariantStorageManagerTest.java @@ -35,6 +35,7 @@ import org.opencb.opencga.core.testclassification.duration.MediumTests; import org.opencb.opencga.storage.core.exceptions.StorageEngineException; import org.opencb.opencga.storage.core.variant.VariantStorageEngine; +import org.opencb.opencga.storage.core.variant.VariantStorageOptions; import java.util.Collections; import java.util.HashSet; @@ -101,6 +102,21 @@ public void testConfigure() throws CatalogException, StorageEngineException { assertNotNull(vse2.getOptions().get("KeyFromTheSecondStudy")); } + @Test + public void testConfigureProtectedValues() throws Exception { + VariantStorageOptions key = VariantStorageOptions.WALKER_DOCKER_MEMORY; + assertTrue(key.isProtected()); + ObjectMap conf = new ObjectMap(key.key(), "30g"); + + String fqn = catalogManager.getProjectManager().get(projectId, null, sessionId).first().getFqn(); + + variantManager.configureProject(fqn, new ObjectMap(conf), opencga.getAdminToken()); + + thrown.expect(StorageEngineException.class); + thrown.expectMessage("Unable to update protected option '" + key.key() + "'"); + variantManager.configureProject(projectId, new ObjectMap(conf), sessionId); + } + @Test public void testConfigureSampleIndex() throws Exception { SampleIndexConfiguration conf = getRandomConf(); diff --git a/opencga-storage/opencga-storage-core/src/main/java/org/opencb/opencga/storage/core/variant/VariantStorageEngine.java b/opencga-storage/opencga-storage-core/src/main/java/org/opencb/opencga/storage/core/variant/VariantStorageEngine.java index bf46887740f..b10b2c73058 100644 --- 
a/opencga-storage/opencga-storage-core/src/main/java/org/opencb/opencga/storage/core/variant/VariantStorageEngine.java +++ b/opencga-storage/opencga-storage-core/src/main/java/org/opencb/opencga/storage/core/variant/VariantStorageEngine.java @@ -285,7 +285,7 @@ public List exportData(URI outputFile, VariantOutputFormat outputFormat, UR return exporter.export(outputFile, outputFormat, variantsFile, parsedVariantQuery); } - public List walkData(URI outputFile, VariantWriterFactory.VariantOutputFormat format, Query query, QueryOptions queryOptions, + public URI walkData(URI outputFile, VariantWriterFactory.VariantOutputFormat format, Query query, QueryOptions queryOptions, String dockerImage, String commandLine) throws IOException, StorageEngineException { if (format == VariantWriterFactory.VariantOutputFormat.VCF || format == VariantWriterFactory.VariantOutputFormat.VCF_GZ) { @@ -304,8 +304,11 @@ public List walkData(URI outputFile, VariantWriterFactory.VariantOutputForm String dockerCommandLine = "docker run --rm -i " + "--memory " + memory + " " - + "--cpus " + cpu + " " - + "--user " + user + " "; + + "--cpus " + cpu + " "; + + if (StringUtils.isNotEmpty(user)) { + dockerCommandLine += "--user " + user + " "; + } if (StringUtils.isNotEmpty(volume)) { dockerCommandLine += "-v " + volume + ":/data "; @@ -323,7 +326,7 @@ public List walkData(URI outputFile, VariantWriterFactory.VariantOutputForm } - public abstract List walkData(URI outputFile, VariantOutputFormat format, Query query, QueryOptions queryOptions, + public abstract URI walkData(URI outputFile, VariantOutputFormat format, Query query, QueryOptions queryOptions, String commandLine) throws StorageEngineException; @@ -1202,6 +1205,14 @@ public abstract void loadVariantScore(URI scoreFile, String study, String scoreN @Override public abstract void testConnection() throws StorageEngineException; + public void validateNewConfiguration(ObjectMap params) throws StorageEngineException { + for (VariantStorageOptions option : VariantStorageOptions.values()) { + if (option.isProtected() && params.get(option.key()) != null) { + throw new StorageEngineException("Unable to update protected option '" + option.key() + "'"); + } + } + } + public void reloadCellbaseConfiguration() { cellBaseUtils = null; } diff --git a/opencga-storage/opencga-storage-core/src/main/java/org/opencb/opencga/storage/core/variant/VariantStorageOptions.java b/opencga-storage/opencga-storage-core/src/main/java/org/opencb/opencga/storage/core/variant/VariantStorageOptions.java index f7736bd5b1f..b00bd525bd8 100644 --- a/opencga-storage/opencga-storage-core/src/main/java/org/opencb/opencga/storage/core/variant/VariantStorageOptions.java +++ b/opencga-storage/opencga-storage-core/src/main/java/org/opencb/opencga/storage/core/variant/VariantStorageOptions.java @@ -102,7 +102,7 @@ public enum VariantStorageOptions implements ConfigurationOption { WALKER_DOCKER_MEMORY("walker.docker.memory", "512m", true), WALKER_DOCKER_CPU("walker.docker.cpu", "1", true), - WALKER_DOCKER_USER("walker.docker.user", "root", true), + WALKER_DOCKER_USER("walker.docker.user", "", true), WALKER_DOCKER_ENV("walker.docker.env", "", true), WALKER_DOCKER_MOUNT("walker.docker.mount", "", true), WALKER_DOCKER_OPTS("walker.docker.opts", "", true), diff --git a/opencga-storage/opencga-storage-core/src/main/java/org/opencb/opencga/storage/core/variant/io/VariantWriterFactory.java b/opencga-storage/opencga-storage-core/src/main/java/org/opencb/opencga/storage/core/variant/io/VariantWriterFactory.java index 
fa002facbd5..61c2e6552d5 100644 --- a/opencga-storage/opencga-storage-core/src/main/java/org/opencb/opencga/storage/core/variant/io/VariantWriterFactory.java +++ b/opencga-storage/opencga-storage-core/src/main/java/org/opencb/opencga/storage/core/variant/io/VariantWriterFactory.java @@ -122,7 +122,7 @@ public boolean isSnappy() { return extension.endsWith(".snappy"); } - public VariantOutputFormat inPlan() { + public VariantOutputFormat inPlain() { if (!isPlain()) { return VariantOutputFormat.valueOf(name().replace("_GZ", "").replace("_SNAPPY", "")); } else { diff --git a/opencga-storage/opencga-storage-core/src/main/resources/storage-configuration.yml b/opencga-storage/opencga-storage-core/src/main/resources/storage-configuration.yml index b9970d18eaf..21c2dd12f79 100644 --- a/opencga-storage/opencga-storage-core/src/main/resources/storage-configuration.yml +++ b/opencga-storage/opencga-storage-core/src/main/resources/storage-configuration.yml @@ -128,6 +128,13 @@ variant: search.intersect.always: false # Force intersect queries search.intersect.params.threshold: 3 # Minimum number of QueryParams in the query to intersect + walker.docker.memory: "512m" # Memory limit for the docker executor + walker.docker.cpu: "1" # CPU limit for the docker executor + walker.docker.user: "" # User to run the docker executor + walker.docker.env: "" # Environment variables to be passed to the docker executor. e.g. key=value,key2=value2 + walker.docker.mount: "" # Volumes to be mounted in the docker executor + walker.docker.opts: "" # Additional docker options + ## The following section defines all available storage engine plugins installed engines: ## Hadoop Storage Engine @@ -177,6 +184,9 @@ variant: # See opencb/opencga#352 for more info. storage.hadoop.mr.scanner.timeout: 300000 + # DOCKER_HOST environment variable to be used by the docker executor inside the MapReduce job + storage.hadoop.mr.stream.docker.host: "" + mapreduce.map.memory.mb: 2048 DeleteHBaseColumnDriver: storage.hadoop.write.mappers.limit.factor: 4 diff --git a/opencga-storage/opencga-storage-core/src/test/java/org/opencb/opencga/storage/core/variant/dummy/DummyVariantStorageEngine.java b/opencga-storage/opencga-storage-core/src/test/java/org/opencb/opencga/storage/core/variant/dummy/DummyVariantStorageEngine.java index e10370dcaaf..65a0169ef8d 100644 --- a/opencga-storage/opencga-storage-core/src/test/java/org/opencb/opencga/storage/core/variant/dummy/DummyVariantStorageEngine.java +++ b/opencga-storage/opencga-storage-core/src/test/java/org/opencb/opencga/storage/core/variant/dummy/DummyVariantStorageEngine.java @@ -143,7 +143,7 @@ public void importData(URI input, VariantMetadata metadata, List walkData(URI outputFile, VariantWriterFactory.VariantOutputFormat format, Query query, QueryOptions queryOptions, String commandLine) throws StorageEngineException { + public URI walkData(URI outputFile, VariantWriterFactory.VariantOutputFormat format, Query query, QueryOptions queryOptions, String commandLine) throws StorageEngineException { throw new UnsupportedOperationException("Unable to walk data in " + getStorageEngineId()); } diff --git a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/HadoopVariantStorageEngine.java b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/HadoopVariantStorageEngine.java index fdee34d3133..061bc956427 100644 --- 
a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/HadoopVariantStorageEngine.java +++ b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/HadoopVariantStorageEngine.java @@ -317,20 +317,38 @@ protected VariantExporter newVariantExporter(VariantMetadataFactory metadataFact } @Override - public List walkData(URI outputFile, VariantWriterFactory.VariantOutputFormat format, + public URI walkData(URI outputFile, VariantWriterFactory.VariantOutputFormat format, Query query, QueryOptions queryOptions, String commandLine) throws StorageEngineException { ParsedVariantQuery variantQuery = parseQuery(query, queryOptions); int studyId = variantQuery.getStudyQuery().getDefaultStudy().getId(); + ObjectMap params = new ObjectMap(getOptions()).appendAll(variantQuery.getQuery()).appendAll(variantQuery.getInputOptions()); + params.remove(StreamVariantDriver.COMMAND_LINE_PARAM); + + String memory = getOptions().getString(WALKER_DOCKER_MEMORY.key(), WALKER_DOCKER_MEMORY.defaultValue()); + int memoryBytes; + if (memory.endsWith("M") || memory.endsWith("m")) { + memoryBytes = Integer.parseInt(memory.substring(0, memory.length() - 1)) * 1024 * 1024; + } else if (memory.endsWith("G") || memory.endsWith("g")) { + memoryBytes = Integer.parseInt(memory.substring(0, memory.length() - 1)) * 1024 * 1024 * 1024; + } else { + memoryBytes = Integer.parseInt(memory); + } + + String dockerHost = getOptions().getString(MR_STREAM_DOCKER_HOST.key(), MR_STREAM_DOCKER_HOST.defaultValue()); + if (StringUtils.isNotEmpty(dockerHost)) { + params.put(StreamVariantDriver.ENVIRONMENT_VARIABLES, "DOCKER_HOST=" + dockerHost); + } + getMRExecutor().run(StreamVariantDriver.class, StreamVariantDriver.buildArgs( null, getVariantTableName(), studyId, null, - new ObjectMap().appendAll(variantQuery.getQuery()).appendAll(variantQuery.getInputOptions()) - .append(StreamVariantDriver.MAX_BYTES_PER_MAP_PARAM, 1024 * 10) + params + .append(StreamVariantDriver.MAX_BYTES_PER_MAP_PARAM, memoryBytes / 2) .append(StreamVariantDriver.COMMAND_LINE_BASE64_PARAM, Base64.getEncoder().encodeToString(commandLine.getBytes())) .append(StreamVariantDriver.INPUT_FORMAT_PARAM, format.toString()) .append(StreamVariantDriver.OUTPUT_PARAM, outputFile) - ), ""); - return null; + ), "Walk data"); + return outputFile; } @Override @@ -1335,4 +1353,15 @@ public void testConnection() throws StorageEngineException { } } + @Override + public void validateNewConfiguration(ObjectMap params) throws StorageEngineException { + super.validateNewConfiguration(params); + + for (HadoopVariantStorageOptions option : HadoopVariantStorageOptions.values()) { + if (option.isProtected() && params.get(option.key()) != null) { + throw new StorageEngineException("Unable to update protected option '" + option.key() + "'"); + } + } + } + } diff --git a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/HadoopVariantStorageOptions.java b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/HadoopVariantStorageOptions.java index 817605be87c..363b07e9fbc 100644 --- a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/HadoopVariantStorageOptions.java +++ 
b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/HadoopVariantStorageOptions.java @@ -60,6 +60,8 @@ public enum HadoopVariantStorageOptions implements ConfigurationOption { MR_EXECUTOR_SSH_HADOOP_SCP_BIN("storage.hadoop.mr.executor.ssh.hadoop-scp.bin", "misc/scripts/hadoop-scp.sh"), MR_EXECUTOR_SSH_HADOOP_TERMINATION_GRACE_PERIOD_SECONDS("storage.hadoop.mr.executor.ssh.terminationGracePeriodSeconds", 120), + MR_STREAM_DOCKER_HOST("storage.hadoop.mr.stream.docker.host", "", true), + ///////////////////////// // Variant table configuration ///////////////////////// @@ -134,6 +136,7 @@ public enum HadoopVariantStorageOptions implements ConfigurationOption { private final String key; private final Object value; + private final boolean isProtected; HadoopVariantStorageOptions(String key) { this(key, null); @@ -142,6 +145,13 @@ public enum HadoopVariantStorageOptions implements ConfigurationOption { HadoopVariantStorageOptions(String key, Object value) { this.key = key; this.value = value; + this.isProtected = false; + } + + HadoopVariantStorageOptions(String key, Object value, boolean isProtected) { + this.key = key; + this.value = value; + this.isProtected = isProtected; } @Override @@ -157,4 +167,11 @@ public String key() { public T defaultValue() { return (T) value; } + + @Override + public boolean isProtected() { + return isProtected; + } + + } diff --git a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/mr/StreamVariantDriver.java b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/mr/StreamVariantDriver.java index bb31552ad64..5a248e190e1 100644 --- a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/mr/StreamVariantDriver.java +++ b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/mr/StreamVariantDriver.java @@ -20,6 +20,7 @@ import java.io.IOException; import java.lang.invoke.MethodHandles; +import java.util.HashMap; import java.util.Map; public class StreamVariantDriver extends VariantDriver { @@ -28,11 +29,13 @@ public class StreamVariantDriver extends VariantDriver { public static final String COMMAND_LINE_PARAM = "commandLine"; public static final String COMMAND_LINE_BASE64_PARAM = "commandLineBase64"; public static final String MAX_BYTES_PER_MAP_PARAM = "maxBytesPerMap"; + public static final String ENVIRONMENT_VARIABLES = "envVars"; private VariantWriterFactory.VariantOutputFormat format; private int maxBytesPerMap; private static Logger logger = LoggerFactory.getLogger(StreamVariantDriver.class); private String commandLine; + private Map envVars; private Class mapperClass; private Class reducerClass; @@ -76,6 +79,20 @@ protected void parseAndValidateParameters() throws IOException { commandLine = new String(java.util.Base64.getDecoder().decode(commandLineBase64)); } + envVars = new HashMap<>(); + String envVarsStr = getParam(ENVIRONMENT_VARIABLES); + if (StringUtils.isNotEmpty(envVarsStr)) { + String[] split = envVarsStr.split(","); + for (String s : split) { + String[] split1 = s.split("="); + if (split1.length != 2) { + throw new IllegalArgumentException("Invalid environment variable '" + s + "'"); + } + envVars.put(split1[0], split1[1]); + } + } + + String outdirStr = getParam(OUTPUT_PARAM); if 
(StringUtils.isEmpty(outdirStr)) { throw new IllegalArgumentException("Missing argument " + OUTPUT_PARAM); @@ -115,6 +132,7 @@ protected void setupJob(Job job) throws IOException { StreamVariantMapper.setCommandLine(job, commandLine); StreamVariantMapper.setVariantFormat(job, format); StreamVariantMapper.setMaxInputBytesPerProcess(job, maxBytesPerMap); + StreamVariantMapper.setEnvironment(job, envVars); reducerClass = Reducer.class; diff --git a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/mr/StreamVariantMapper.java b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/mr/StreamVariantMapper.java index 163d6bd9646..df5425e8d4d 100644 --- a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/mr/StreamVariantMapper.java +++ b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/mr/StreamVariantMapper.java @@ -18,10 +18,7 @@ import org.opencb.opencga.storage.hadoop.variant.metadata.HBaseVariantStorageMetadataDBAdaptorFactory; import java.io.*; -import java.util.Base64; -import java.util.HashMap; -import java.util.Map; -import java.util.concurrent.atomic.AtomicReference; +import java.util.*; import static org.opencb.opencga.storage.hadoop.variant.mr.VariantsTableMapReduceHelper.COUNTER_GROUP_NAME; @@ -30,8 +27,9 @@ public class StreamVariantMapper extends VariantMapper throwable = new AtomicReference<>(); + protected final List throwables = Collections.synchronizedList(new ArrayList<>()); private volatile boolean processProvidedStatus_ = false; public static void setCommandLine(Job job, String commandLine) { String commandLineBase64 = Base64.getEncoder().encodeToString(commandLine.getBytes()); - job.getConfiguration().set(STREAMPROCESSOR, commandLineBase64); + job.getConfiguration().set(COMMANDLINE_BASE64, commandLineBase64); } public static void setVariantFormat(Job job, VariantWriterFactory.VariantOutputFormat format) { @@ -82,16 +80,15 @@ public static void setMaxInputBytesPerProcess(Job job, int maxInputBytesPerProce protected void setup(Context context) throws IOException, InterruptedException { super.setup(context); Configuration conf = context.getConfiguration(); - commandLine = new String(Base64.getDecoder().decode(conf.get(STREAMPROCESSOR))); + commandLine = new String(Base64.getDecoder().decode(conf.get(COMMANDLINE_BASE64))); maxInputBytesPerProcess = conf.getInt(MAX_INPUT_BYTES_PER_PROCESS, 1024 * 1024 * 1024); format = VariantWriterFactory.toOutputFormat(conf.get(VARIANT_FORMAT), ""); if (!format.isPlain()) { - format = format.inPlan(); + format = format.inPlain(); } - envs = new HashMap<>(); - addEnvironment(envs, conf.get("stream.addenvironment")); + addEnvironment(envs, conf); // add TMPDIR environment variable with the value of java.io.tmpdir envs.put("TMPDIR", System.getProperty("java.io.tmpdir")); @@ -135,24 +132,33 @@ public void run(Context context) throws IOException, InterruptedException { } private boolean hasExceptions() { - return throwable.get() != null; + return !throwables.isEmpty(); } private void setException(Throwable th) { - if (!throwable.compareAndSet(null, th)) { - synchronized (throwable) { - // addSuppressed is not thread safe - throwable.get().addSuppressed(th); - } - } + throwables.add(th); LOG.warn("{}", th); } private void throwExceptionIfAny() throws IOException { if 
(hasExceptions()) {
-            Throwable cause = throwable.get();
-            throwable.set(null);
-            throw new IOException("MROutput/MRErrThread failed:", cause);
+            String message = "StreamVariantMapper failed:";
+            if (stderrThread != null) {
+                String stderr = String.join("\n", stderrThread.stderrBuffer);
+                message += "\nSTDERR: " + stderr;
+            }
+            if (throwables.size() == 1) {
+                Throwable cause = throwables.get(0);
+                throwables.clear();
+                throw new IOException(message, cause);
+            } else {
+                IOException exception = new IOException(message);
+                for (int i = 1; i < throwables.size(); i++) {
+                    exception.addSuppressed(throwables.get(i));
+                }
+                throwables.clear();
+                throw exception;
+            }
         }
     }
@@ -247,7 +253,6 @@ private void startProcess(Context context) throws IOException {
         processedBytes = 0;
         numRecordsRead = 0;
         numRecordsWritten = 0;
-        throwable.set(null);
         variantDataWriter.open();
         variantDataWriter.pre();
@@ -255,7 +260,30 @@
     }
-    void addEnvironment(Map<String, String> env, String nameVals) {
+    public static void setEnvironment(Job job, Map<String, String> env) {
+        if (env == null || env.isEmpty()) {
+            return;
+        }
+        StringBuilder sb = new StringBuilder();
+        for (Map.Entry<String, String> entry : env.entrySet()) {
+            if (entry.getKey().contains(" ") || entry.getValue().contains(" ")) {
+                throw new IllegalArgumentException("Environment variables cannot contain spaces: "
+                        + "'" + entry.getKey() + "' = '" + entry.getValue() + "'");
+            }
+            if (entry.getKey().contains("=") || entry.getValue().contains("=")) {
+                throw new IllegalArgumentException("Environment variables cannot contain '=': "
+                        + "'" + entry.getKey() + "' = '" + entry.getValue() + "'");
+            }
+            if (sb.length() > 0) {
+                sb.append(" ");
+            }
+            sb.append(entry.getKey()).append("=").append(entry.getValue());
+        }
+        job.getConfiguration().set(ADDENVIRONMENT_PARAM, sb.toString());
+    }
+
+    public static void addEnvironment(Map<String, String> env, Configuration conf) {
+        String nameVals = conf.get(ADDENVIRONMENT_PARAM);
         // encoding "a=b c=d" from StreamJob
         if (nameVals == null) {
             return;
@@ -264,7 +292,7 @@
         String[] nv = nameVals.split(" ");
         for (int i = 0; i < nv.length; i++) {
             String[] pair = nv[i].split("=", 2);
             if (pair.length != 2) {
-                LOG.info("Skip env entry:" + nv[i]);
+                throw new IllegalArgumentException("Invalid name=value: " + nv[i]);
             } else {
                 env.put(pair[0], pair[1]);
             }
@@ -319,6 +347,9 @@ private class MRErrorThread extends Thread {
         private final String reporterPrefix;
         private final String counterPrefix;
         private final String statusPrefix;
+        private final LinkedList<String> stderrBuffer = new LinkedList<>();
+        private int stderrBufferSize = 0;
+        private static final int STDERR_BUFFER_CAPACITY = 10 * 1024;
         MRErrorThread(Context context) {
             this.context = context;
@@ -345,6 +376,12 @@ public void run() {
                     LOG.warn("Cannot parse reporter line: " + lineStr);
                 }
             } else {
+                // Store STDERR in a circular buffer (just the last 10KB), and include it in case of exception
+                stderrBuffer.add(lineStr);
+                stderrBufferSize += lineStr.length();
+                while (stderrBufferSize > STDERR_BUFFER_CAPACITY && stderrBuffer.size() > 3) {
+                    stderrBufferSize -= stderrBuffer.remove().length();
+                }
                 LOG.info("[STDERR] - " + lineStr);
 //                System.err.println(lineStr);
             }
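The stderrBuffer introduced here keeps only a bounded tail of the child's stderr so it can be appended to the thrown exception without unbounded memory growth. A minimal standalone Python sketch of the same bounded-tail idea (names are illustrative, and the sequential stdin/stderr handling is a simplification; the mapper drains stderr on a dedicated thread to avoid pipe deadlocks):

import subprocess
from collections import deque

STDERR_BUFFER_CAPACITY = 10 * 1024  # keep roughly the last 10KB, as in MRErrorThread

def run_with_stderr_tail(command_line, stdin_lines):
    """Run a shell command; on failure, raise with the last ~10KB of its stderr."""
    tail = deque()
    size = 0
    process = subprocess.Popen(['bash', '-ce', command_line],
                               stdin=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
    for line in stdin_lines:
        process.stdin.write(line + '\n')
    process.stdin.close()
    for line in process.stderr:
        tail.append(line)
        size += len(line)
        # Trim from the front once over capacity, keeping at least a few lines.
        while size > STDERR_BUFFER_CAPACITY and len(tail) > 3:
            size -= len(tail.popleft())
    if process.wait() != 0:
        raise RuntimeError('Process failed:\nSTDERR: ' + ''.join(tail))

# Example: raises with the stderr tail, since grep exits non-zero on no match.
# run_with_stderr_tail('grep PASS >/dev/null', ['1:1000:A:C\tLowQual'])

From 7558a26ee5c424efb5876623b9554c3e11721a8f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jacobo=20Coll=20Morag=C3=B3n?=
Date: Thu, 10 Oct 2024 12:47:47 +0100
Subject: [PATCH 03/66] storage: Add status details when throwing exceptions.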
#TASK-6722 --- .../variant/mr/StreamVariantMapper.java | 52 +++++++++++++++---- 1 file changed, 43 insertions(+), 9 deletions(-) diff --git a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/mr/StreamVariantMapper.java b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/mr/StreamVariantMapper.java index df5425e8d4d..f2c6e4c1d94 100644 --- a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/mr/StreamVariantMapper.java +++ b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/mr/StreamVariantMapper.java @@ -117,13 +117,24 @@ public void run(Context context) throws IOException, InterruptedException { map(context.getCurrentKey(), context.getCurrentValue(), context); } while (!hasExceptions() && context.nextKeyValue()); } catch (Throwable th) { - setException(th); + Object currentKey = context.getCurrentKey(); + if (currentKey != null) { + String keyStr; + if (currentKey instanceof ImmutableBytesWritable) { + keyStr = Bytes.toStringBinary(((ImmutableBytesWritable) currentKey).get()); + } else { + keyStr = currentKey.toString(); + } + addException("Exception in mapper for key: " + keyStr, th); + } else { + addException(th); + } } try { // Always call cleanup, even if there was an exception cleanup(context); } catch (Throwable th) { - setException(th); + addException(th); } } else { context.getCounter(COUNTER_GROUP_NAME, "EMPTY_INPUT_SPLIT").increment(1); @@ -135,9 +146,30 @@ private boolean hasExceptions() { return !throwables.isEmpty(); } - private void setException(Throwable th) { + private void addException(String message, Throwable th) { + addException(new Exception(message, th)); + } + + private void addException(Throwable th) { throwables.add(th); LOG.warn("{}", th); + if (th instanceof OutOfMemoryError) { + try { + // Print the current memory status in multiple lines + Runtime runtime = Runtime.getRuntime(); + LOG.warn("Catch OutOfMemoryError!"); + LOG.warn("Free memory: " + runtime.freeMemory()); + LOG.warn("Total memory: " + runtime.totalMemory()); + LOG.warn("Max memory: " + runtime.maxMemory()); + th.addSuppressed(new Exception( + "Free memory: " + runtime.freeMemory() + ", " + + "Total memory: " + runtime.totalMemory() + ", " + + "Max memory: " + runtime.maxMemory())); + } catch (Throwable t) { + // Ignore any exception while printing the memory status + LOG.warn("Error printing memory status", t); + } + } } private void throwExceptionIfAny() throws IOException { @@ -201,7 +233,7 @@ private void closeProcess(Context context) throws IOException, InterruptedExcept process = null; } } catch (Throwable th) { - setException(th); + addException(th); } try { @@ -211,7 +243,7 @@ private void closeProcess(Context context) throws IOException, InterruptedExcept stdout = null; } } catch (Throwable th) { - setException(th); + addException(th); } try { if (stderr != null) { @@ -220,7 +252,7 @@ private void closeProcess(Context context) throws IOException, InterruptedExcept stderr = null; } } catch (Throwable th) { - setException(th); + addException(th); } // drainStdout(context); } @@ -231,6 +263,9 @@ private void startProcess(Context context) throws IOException { // Start the process ProcessBuilder builder = new ProcessBuilder("bash", "-ce", commandLine); +// System.getenv().forEach((k, v) -> LOG.info("SYSTEM 
ENV: " + k + "=" + v)); +// builder.environment().forEach((k, v) -> LOG.info("ProcessBuilder ENV: " + k + "=" + v)); +// envs.forEach((k, v) -> LOG.info("Config ENV: " + k + "=" + v)); builder.environment().putAll(envs); process = builder.start(); @@ -334,7 +369,7 @@ public void run() { } } } catch (Throwable th) { - setException(th); + addException(th); } } } @@ -383,7 +418,6 @@ public void run() { stderrBufferSize -= stderrBuffer.remove().length(); } LOG.info("[STDERR] - " + lineStr); -// System.err.println(lineStr); } long now = System.currentTimeMillis(); if (now - lastStderrReport > REPORTER_ERR_DELAY) { @@ -393,7 +427,7 @@ public void run() { line.clear(); } } catch (Throwable th) { - setException(th); + addException(th); } } From bc7c6ae8b65db767e57f7771cc6f3f276af25d40 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jacobo=20Coll=20Morag=C3=B3n?= Date: Fri, 11 Oct 2024 08:57:08 +0100 Subject: [PATCH 04/66] storage: Fix walker output file name #TASK-6722 --- .../opencb/opencga/analysis/variant/VariantWalkerTool.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/opencga-analysis/src/main/java/org/opencb/opencga/analysis/variant/VariantWalkerTool.java b/opencga-analysis/src/main/java/org/opencb/opencga/analysis/variant/VariantWalkerTool.java index a3eddd7eefe..56e008e0daf 100644 --- a/opencga-analysis/src/main/java/org/opencb/opencga/analysis/variant/VariantWalkerTool.java +++ b/opencga-analysis/src/main/java/org/opencb/opencga/analysis/variant/VariantWalkerTool.java @@ -53,13 +53,13 @@ protected void check() throws Exception { toolParams.setFileFormat(VariantWriterFactory.VariantOutputFormat.VCF.toString()); } - format = VariantWriterFactory.toOutputFormat(toolParams.getOutputFileName(), toolParams.getOutputFileName()); + format = VariantWriterFactory.toOutputFormat(toolParams.getFileFormat(), toolParams.getOutputFileName()); if (!format.isPlain()) { format = format.inPlain(); } if (StringUtils.isEmpty(toolParams.getOutputFileName())) { - toolParams.setOutputFileName("output." 
+ format.toString().toLowerCase() + ".gz");
+            toolParams.setOutputFileName("output.txt.gz");
         } else if (!toolParams.getOutputFileName().endsWith(".gz")) {
             toolParams.setOutputFileName(toolParams.getOutputFileName() + ".gz");
         }

From ab4dff5b076cb29e684024f593c5084f5ed7cebc Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jacobo=20Coll=20Morag=C3=B3n?=
Date: Tue, 15 Oct 2024 16:14:02 +0100
Subject: [PATCH 05/66] storage: Properly configure task java heap #TASK-6722

---
 .../opencb/opencga/core/common/IOUtils.java   | 15 ++--
 .../core/variant/VariantStorageOptions.java   |  2 +-
 .../main/resources/storage-configuration.yml  | 18 ++--
 .../hadoop/utils/AbstractHBaseDriver.java     | 11 ++-
 .../variant/HadoopVariantStorageEngine.java   | 14 +--
 .../variant/HadoopVariantStorageOptions.java  | 11 +++
 .../variant/mr/StreamVariantMapper.java       | 31 +++++--
 .../variant/mr/VariantMapReduceUtil.java      | 90 +++++++++++++++++++
 .../src/main/python/variant_walker.py         | 37 ++++----
 9 files changed, 174 insertions(+), 55 deletions(-)

diff --git a/opencga-core/src/main/java/org/opencb/opencga/core/common/IOUtils.java b/opencga-core/src/main/java/org/opencb/opencga/core/common/IOUtils.java
index e37374e76ea..eb0cdeaf29e 100644
--- a/opencga-core/src/main/java/org/opencb/opencga/core/common/IOUtils.java
+++ b/opencga-core/src/main/java/org/opencb/opencga/core/common/IOUtils.java
@@ -389,15 +389,16 @@ public static long fromHumanReadableToByte(String value, boolean assumeBinary) {
         if (value.endsWith("B")) {
             value = value.substring(0, value.length() - 1);
         }
-        boolean si;
-        if (value.endsWith("i")) {
-            si = false;
-            value = value.substring(0, value.length() - 1);
-        } else {
-            si = true;
-        }
+        final boolean si;
         if (assumeBinary) {
             si = false;
+        } else {
+            if (value.endsWith("i")) {
+                si = false;
+                value = value.substring(0, value.length() - 1);
+            } else {
+                si = true;
+            }
         }
         int unit = si ? 1000 : 1024;
         int exp = "KMGTPE".indexOf(value.toUpperCase().charAt(value.length() - 1)) + 1;
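With this change, assumeBinary short-circuits the SI/binary detection instead of first consuming a trailing "i". A small Python rendering of the parsing rules, for reference (a sketch matching the visible Java logic; the final multiplication step is assumed from the computed unit and exponent):

def from_human_readable_to_byte(value, assume_binary=False):
    """Parse sizes like '512m', '1Gi' or '2K' into bytes, mirroring IOUtils."""
    if value.endswith('B'):
        value = value[:-1]
    if assume_binary:
        si = False
    else:
        # A trailing 'i' (as in 'Gi') marks binary units; otherwise use SI.
        si = not value.endswith('i')
        if not si:
            value = value[:-1]
    unit = 1000 if si else 1024
    exp = 'KMGTPE'.find(value[-1].upper()) + 1
    if exp > 0:
        value = value[:-1]
    return int(value) * unit ** exp

assert from_human_readable_to_byte('1024m', True) == 1024 * 1024 * 1024
assert from_human_readable_to_byte('1k') == 1000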
diff --git a/opencga-storage/opencga-storage-core/src/main/java/org/opencb/opencga/storage/core/variant/VariantStorageOptions.java b/opencga-storage/opencga-storage-core/src/main/java/org/opencb/opencga/storage/core/variant/VariantStorageOptions.java
index b00bd525bd8..c8d8cf63cef 100644
--- a/opencga-storage/opencga-storage-core/src/main/java/org/opencb/opencga/storage/core/variant/VariantStorageOptions.java
+++ b/opencga-storage/opencga-storage-core/src/main/java/org/opencb/opencga/storage/core/variant/VariantStorageOptions.java
@@ -100,7 +100,7 @@ public enum VariantStorageOptions implements ConfigurationOption {
     QUERY_SAMPLE_LIMIT_DEFAULT("query.sample.limit.default", 100),
     QUERY_SAMPLE_LIMIT_MAX("query.sample.limit.max", 1000),
-    WALKER_DOCKER_MEMORY("walker.docker.memory", "512m", true),
+    WALKER_DOCKER_MEMORY("walker.docker.memory", "1024m", true),
     WALKER_DOCKER_CPU("walker.docker.cpu", "1", true),
     WALKER_DOCKER_USER("walker.docker.user", "", true),
diff --git a/opencga-storage/opencga-storage-core/src/main/resources/storage-configuration.yml b/opencga-storage/opencga-storage-core/src/main/resources/storage-configuration.yml
index 21c2dd12f79..f422770d9b1 100644
--- a/opencga-storage/opencga-storage-core/src/main/resources/storage-configuration.yml
+++ b/opencga-storage/opencga-storage-core/src/main/resources/storage-configuration.yml
@@ -128,12 +128,12 @@ variant:
     search.intersect.always: false # Force intersect queries
     search.intersect.params.threshold: 3 # Minimum number of QueryParams in the query to intersect
-    walker.docker.memory: "512m" # Memory limit for the docker executor
-    walker.docker.cpu: "1" # CPU limit for the docker executor
-    walker.docker.user: "" # User to run the docker executor
-    walker.docker.env: "" # Environment variables to be passed to the docker executor. e.g. key=value,key2=value2
-    walker.docker.mount: "" # Volumes to be mounted in the docker executor
-    walker.docker.opts: "" # Additional docker options
+    walker.docker.memory: "1024m" # Memory limit for the docker executor
+#    walker.docker.cpu: "1" # CPU limit for the docker executor
+#    walker.docker.user: "" # User to run the docker executor
+#    walker.docker.env: "" # Environment variables to be passed to the docker executor. e.g.
key=value,key2=value2 +# walker.docker.mount: "" # Volumes to be mounted in the docker executor +# walker.docker.opts: "" # Additional docker options ## The following section defines all available storage engine plugins installed engines: @@ -191,16 +191,16 @@ variant: DeleteHBaseColumnDriver: storage.hadoop.write.mappers.limit.factor: 4 DiscoverPendingVariantsDriver: - mapreduce.map.memory.mb: 750 + mapreduce.map.memory.mb: 2048 VariantStatsDriver: mapreduce.map.memory.mb: 2048 + StreamVariantDriver: + mapreduce.map.memory.mb: 3072 SampleIndexDriver: mapreduce.map.memory.mb: 4096 max-columns-per-scan: 8000 SampleIndexAnnotationLoaderDriver: mapreduce.map.memory.mb: 4096 - VariantMigration200Driver: - mapreduce.map.memory.mb: 1024 ## PENDING diff --git a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/utils/AbstractHBaseDriver.java b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/utils/AbstractHBaseDriver.java index 4e9fe7057a8..4a15198f102 100644 --- a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/utils/AbstractHBaseDriver.java +++ b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/utils/AbstractHBaseDriver.java @@ -11,6 +11,7 @@ import org.apache.hadoop.hbase.mapreduce.TableInputFormat; import org.apache.hadoop.hbase.mapreduce.TableOutputFormat; import org.apache.hadoop.io.IOUtils; +import org.apache.hadoop.mapred.JobConf; import org.apache.hadoop.mapreduce.*; import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; import org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter; @@ -80,6 +81,7 @@ private Job newJob() throws IOException { addJobConf(job, MRJobConfig.JOB_RUNNING_MAP_LIMIT); addJobConf(job, MRJobConfig.JOB_RUNNING_REDUCE_LIMIT); addJobConf(job, MRJobConfig.TASK_TIMEOUT); + VariantMapReduceUtil.configureTaskJavaHeap(((JobConf) job.getConfiguration()), getClass()); return job; } @@ -171,10 +173,15 @@ public final int run(String[] args) throws Exception { } else { LOGGER.info(" * Mapper : " + job.getMapperClass().getName()); } - LOGGER.info(" - memory (MB) : " + job.getConfiguration().getInt(MRJobConfig.MAP_MEMORY_MB, -1)); + JobConf jobConf = (JobConf) job.getConfiguration(); + LOGGER.info(" - memory required (MB) : " + jobConf.getMemoryRequired(TaskType.MAP)); + LOGGER.info(" - java-heap (MB) : " + JobConf.parseMaximumHeapSizeMB(jobConf.getTaskJavaOpts(TaskType.MAP))); + LOGGER.info(" - java-opts : " + jobConf.getTaskJavaOpts(TaskType.MAP)); if (job.getNumReduceTasks() > 0) { LOGGER.info(" * Reducer : " + job.getNumReduceTasks() + "x " + job.getReducerClass().getName()); - LOGGER.info(" - memory (MB) : " + job.getConfiguration().getInt(MRJobConfig.REDUCE_MEMORY_MB, -1)); + LOGGER.info(" - memory required (MB) : " + jobConf.getMemoryRequired(TaskType.REDUCE)); + LOGGER.info(" - java-heap (MB) : " + JobConf.parseMaximumHeapSizeMB(jobConf.getTaskJavaOpts(TaskType.REDUCE))); + LOGGER.info(" - java-opts : " + jobConf.getTaskJavaOpts(TaskType.REDUCE)); } else { LOGGER.info(" * Reducer : (no reducer)"); } diff --git a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/HadoopVariantStorageEngine.java 
b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/HadoopVariantStorageEngine.java index 061bc956427..8598407233d 100644 --- a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/HadoopVariantStorageEngine.java +++ b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/HadoopVariantStorageEngine.java @@ -324,15 +324,8 @@ public URI walkData(URI outputFile, VariantWriterFactory.VariantOutputFormat for ObjectMap params = new ObjectMap(getOptions()).appendAll(variantQuery.getQuery()).appendAll(variantQuery.getInputOptions()); params.remove(StreamVariantDriver.COMMAND_LINE_PARAM); - String memory = getOptions().getString(WALKER_DOCKER_MEMORY.key(), WALKER_DOCKER_MEMORY.defaultValue()); - int memoryBytes; - if (memory.endsWith("M") || memory.endsWith("m")) { - memoryBytes = Integer.parseInt(memory.substring(0, memory.length() - 1)) * 1024 * 1024; - } else if (memory.endsWith("G") || memory.endsWith("g")) { - memoryBytes = Integer.parseInt(memory.substring(0, memory.length() - 1)) * 1024 * 1024 * 1024; - } else { - memoryBytes = Integer.parseInt(memory); - } + String dockerMemory = getOptions().getString(WALKER_DOCKER_MEMORY.key(), WALKER_DOCKER_MEMORY.defaultValue()); + long dockerMemoryBytes = IOUtils.fromHumanReadableToByte(dockerMemory, true); String dockerHost = getOptions().getString(MR_STREAM_DOCKER_HOST.key(), MR_STREAM_DOCKER_HOST.defaultValue()); if (StringUtils.isNotEmpty(dockerHost)) { @@ -343,7 +336,8 @@ public URI walkData(URI outputFile, VariantWriterFactory.VariantOutputFormat for null, getVariantTableName(), studyId, null, params - .append(StreamVariantDriver.MAX_BYTES_PER_MAP_PARAM, memoryBytes / 2) + .append(MR_HEAP_MAP_OTHER_MB.key(), dockerMemoryBytes / 1024 / 1024) + .append(StreamVariantDriver.MAX_BYTES_PER_MAP_PARAM, dockerMemoryBytes / 2) .append(StreamVariantDriver.COMMAND_LINE_BASE64_PARAM, Base64.getEncoder().encodeToString(commandLine.getBytes())) .append(StreamVariantDriver.INPUT_FORMAT_PARAM, format.toString()) .append(StreamVariantDriver.OUTPUT_PARAM, outputFile) diff --git a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/HadoopVariantStorageOptions.java b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/HadoopVariantStorageOptions.java index 363b07e9fbc..268caaf9253 100644 --- a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/HadoopVariantStorageOptions.java +++ b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/HadoopVariantStorageOptions.java @@ -61,6 +61,17 @@ public enum HadoopVariantStorageOptions implements ConfigurationOption { MR_EXECUTOR_SSH_HADOOP_TERMINATION_GRACE_PERIOD_SECONDS("storage.hadoop.mr.executor.ssh.terminationGracePeriodSeconds", 120), MR_STREAM_DOCKER_HOST("storage.hadoop.mr.stream.docker.host", "", true), + MR_HEAP_MIN_MB("storage.hadoop.mr.heap.min-mb", 512), // Min heap size for the JVM + MR_HEAP_MAX_MB("storage.hadoop.mr.heap.max-mb", 2048), // Max heap size for the JVM + MR_HEAP_MAP_OTHER_MB("storage.hadoop.mr.heap.map.other-mb", 0), // Other reserved memory. Not used by the JVM heap.
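+ // For example, HadoopVariantStorageEngine.walkData() (above) sets map.other-mb to the walker docker memory in MB, so the container's budget is reserved outside the JVM heap.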
+ MR_HEAP_REDUCE_OTHER_MB("storage.hadoop.mr.heap.reduce.other-mb", 0), // Other reserved memory. Not used by the JVM heap. + MR_HEAP_MEMORY_MB_RATIO("storage.hadoop.mr.heap.memory-mb.ratio", 0.6), // Ratio of the memory to use for the JVM heap. + // Heap size for the map and reduce tasks. + // If not set, it will be calculated as: + // (REQUIRED_MEMORY - MR_HEAP_OTHER_MB) * MR_HEAP_MEMORY_MB_RATIO + // then capped between MR_HEAP_MIN_MB and MR_HEAP_MAX_MB + // e.g. memory-mb 3072, other-mb 1024, ratio 0.6: round((3072 - 1024) * 0.6) = 1229 MB, within [512, 2048] + MR_HEAP_MAP_MB("storage.hadoop.mr.heap.map.mb"), + MR_HEAP_REDUCE_MB("storage.hadoop.mr.heap.reduce.mb"), ///////////////////////// // Variant table configuration diff --git a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/mr/StreamVariantMapper.java b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/mr/StreamVariantMapper.java index f2c6e4c1d94..15233484298 100644 --- a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/mr/StreamVariantMapper.java +++ b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/mr/StreamVariantMapper.java @@ -147,7 +147,14 @@ private boolean hasExceptions() { } private void addException(String message, Throwable th) { - addException(new Exception(message, th)); + th.addSuppressed(new AnnotationException(message)); + addException(th); + } + + public static class AnnotationException extends RuntimeException { + public AnnotationException(String message) { + super(message); + } } private void addException(Throwable th) { @@ -156,15 +163,21 @@ private void addException(Throwable th) { if (th instanceof OutOfMemoryError) { try { // Print the current memory status in multiple lines - Runtime runtime = Runtime.getRuntime(); + Runtime rt = Runtime.getRuntime(); LOG.warn("Catch OutOfMemoryError!"); - LOG.warn("Free memory: " + runtime.freeMemory()); - LOG.warn("Total memory: " + runtime.totalMemory()); - LOG.warn("Max memory: " + runtime.maxMemory()); - th.addSuppressed(new Exception( - "Free memory: " + runtime.freeMemory() + ", " + "Total memory: " + runtime.totalMemory() + ", " + "Max memory: " + runtime.maxMemory())); + LOG.warn("Free memory: " + rt.freeMemory()); + LOG.warn("Total memory: " + rt.totalMemory()); + LOG.warn("Max memory: " + rt.maxMemory()); + + double mb = 1024 * 1024; + th.addSuppressed(new AnnotationException(String.format("Memory usage.
MaxMemory: %.2f MiB" + + " TotalMemory: %.2f MiB" + + " FreeMemory: %.2f MiB" + + " UsedMemory: %.2f MiB", + rt.maxMemory() / mb, + rt.totalMemory() / mb, + rt.freeMemory() / mb, + (rt.totalMemory() - rt.freeMemory()) / mb))); } catch (Throwable t) { // Ignore any exception while printing the memory status LOG.warn("Error printing memory status", t); diff --git a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/mr/VariantMapReduceUtil.java b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/mr/VariantMapReduceUtil.java index 367196742db..0c3ab30a697 100644 --- a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/mr/VariantMapReduceUtil.java +++ b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/mr/VariantMapReduceUtil.java @@ -15,10 +15,12 @@ import org.apache.hadoop.hbase.mapreduce.TableMapReduceUtil; import org.apache.hadoop.hbase.mapreduce.TableMapper; import org.apache.hadoop.hbase.util.Bytes; +import org.apache.hadoop.mapred.JobConf; import org.apache.hadoop.mapred.JobContext; import org.apache.hadoop.mapreduce.InputFormat; import org.apache.hadoop.mapreduce.Job; import org.apache.hadoop.mapreduce.Mapper; +import org.apache.hadoop.mapreduce.TaskType; import org.apache.phoenix.mapreduce.util.PhoenixMapReduceUtil; import org.opencb.commons.datastore.core.ObjectMap; import org.opencb.commons.datastore.core.Query; @@ -51,6 +53,7 @@ import java.util.Collection; import java.util.Collections; import java.util.List; +import java.util.regex.Pattern; /** * Created on 27/10/17. @@ -60,6 +63,8 @@ public class VariantMapReduceUtil { private static final Logger LOGGER = LoggerFactory.getLogger(VariantMapReduceUtil.class); + private static final Pattern JAVA_OPTS_XMX_PATTERN = + Pattern.compile(".*(?:^|\\s)-Xmx(\\d+)([gGmMkK]?)(?:$|\\s).*"); public static void initTableMapperJob(Job job, String inTable, String outTable, Scan scan, Class mapperClass) @@ -609,6 +614,26 @@ public static String getParam(Configuration conf, String key, String defaultValu return getParam(conf, key, defaultValue, null); } + private static String getParam(JobConf conf, TaskType taskType, Class clazz, + HadoopVariantStorageOptions mapKey, HadoopVariantStorageOptions reduceKey) { + final String value; + switch (taskType) { + case MAP: + value = getParam(conf, mapKey, clazz); + break; + case REDUCE: + value = getParam(conf, reduceKey, clazz); + break; + default: + throw new IllegalArgumentException("Unexpected task type " + taskType); + } + return value; + } + + public static String getParam(Configuration conf, ConfigurationOption key, Class aClass) { + return getParam(conf, key.key(), key.defaultValue() == null ? null : key.defaultValue().toString(), aClass); + } + /** * Reads a param that might come in different forms. 
It will take the first value in this order: * - "--{key}" @@ -641,4 +666,69 @@ public static String getParam(Configuration conf, String key, String defaultValu } return value; } + + public static void configureTaskJavaHeap(JobConf conf, Class clazz) { + configureTaskJavaHeap(conf, TaskType.MAP, clazz); + configureTaskJavaHeap(conf, TaskType.REDUCE, clazz); + } + + public static void configureTaskJavaHeap(JobConf conf, TaskType taskType, Class clazz) { + int memoryRequired = conf.getMemoryRequired(taskType); + String heapStr = getParam(conf, taskType, clazz, + HadoopVariantStorageOptions.MR_HEAP_MAP_MB, + HadoopVariantStorageOptions.MR_HEAP_REDUCE_MB); + + int heap; + if (heapStr != null) { + heap = Integer.parseInt(heapStr); + } else { + int minHeap = Integer.parseInt(getParam(conf, HadoopVariantStorageOptions.MR_HEAP_MIN_MB, clazz)); + int maxHeap = Integer.parseInt(getParam(conf, HadoopVariantStorageOptions.MR_HEAP_MAX_MB, clazz)); + double ratio = Double.parseDouble(getParam(conf, HadoopVariantStorageOptions.MR_HEAP_MEMORY_MB_RATIO, clazz)); + int other = Integer.parseInt(getParam(conf, taskType, clazz, + HadoopVariantStorageOptions.MR_HEAP_MAP_OTHER_MB, + HadoopVariantStorageOptions.MR_HEAP_REDUCE_OTHER_MB)); + + heap = (int) Math.round((memoryRequired - other) * ratio); + heap = Math.max(minHeap, heap); + heap = Math.min(maxHeap, heap); + } + setTaskJavaHeap(conf, taskType, heap); + } + + public static void setTaskJavaHeap(Configuration conf, TaskType taskType, int javaHeapMB) { + String javaOpts = getTaskJavaOpts(conf, taskType); + String xmx = " -Xmx" + javaHeapMB + "m"; + if (javaOpts == null) { + javaOpts = xmx; + } else if (javaOpts.contains("-Xmx")) { + javaOpts = JAVA_OPTS_XMX_PATTERN.matcher(javaOpts).replaceFirst(xmx); + } else { + javaOpts += xmx; + } + switch (taskType) { + case MAP: + conf.set(JobConf.MAPRED_MAP_TASK_JAVA_OPTS, javaOpts); + break; + case REDUCE: + conf.set(JobConf.MAPRED_REDUCE_TASK_JAVA_OPTS, javaOpts); + break; + default: + throw new IllegalArgumentException("Unexpected task type " + taskType); + } + } + + public static String getTaskJavaOpts(Configuration conf, TaskType taskType) { + switch (taskType) { + case MAP: + return conf.get(JobConf.MAPRED_MAP_TASK_JAVA_OPTS, + conf.get(JobConf.MAPRED_TASK_JAVA_OPTS, JobConf.DEFAULT_MAPRED_TASK_JAVA_OPTS)); + case REDUCE: + return conf.get(JobConf.MAPRED_REDUCE_TASK_JAVA_OPTS, + conf.get(JobConf.MAPRED_TASK_JAVA_OPTS, JobConf.DEFAULT_MAPRED_TASK_JAVA_OPTS)); + default: + throw new IllegalArgumentException("Unexpected task type " + taskType); + } + } + } diff --git a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/python/variant_walker.py b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/python/variant_walker.py index 4d56e92c45c..fa3ea798e5d 100644 --- a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/python/variant_walker.py +++ b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/python/variant_walker.py @@ -60,15 +60,15 @@ def write(self, value): """ print(value) - def jsonHeaderToVcfHeader(self, jsonHeader): - """ - Convert a JSON header to a VCF header. - - Args: - jsonHeader (dict): The JSON header to convert. - """ - # TODO: Implement this method - return "" + # def jsonHeaderToVcfHeader(self, jsonHeader): + # """ + # Convert a JSON header to a VCF header. + # + # Args: + # jsonHeader (dict): The JSON header to convert.
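+ # Example (assumed shapes, as this helper is not implemented): a JSON entry like {"id": "DP", "type": "Integer"} might map to the VCF line "##INFO=<ID=DP,Number=1,Type=Integer,Description=...>".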
+ # """ + # # TODO: Implement this method + # return "" def getTmpdir(self): @@ -130,16 +130,19 @@ def main(module_name, class_name, *args): if not header_read: if line.startswith("#") or num_entries == 1: header.append(line) - continue else: header_read = True - walker.header(header) - - try: - walker.map(line) - except Exception as e: - print(f"An error occurred while processing the line: {e}", file=sys.stderr) - raise + try: + walker.header(header) + except Exception as e: + print(f"An error occurred while processing the header: {e}", file=sys.stderr) + raise + else: + try: + walker.map(line) + except Exception as e: + print(f"An error occurred while processing a line: {e}", file=sys.stderr) + raise walker.count("num_entries", num_entries) walker.count("size_entries", size_entries) From 7af802034b302c68bf215e552d730d65a1685371 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jacobo=20Coll=20Morag=C3=B3n?= Date: Wed, 16 Oct 2024 15:20:12 +0100 Subject: [PATCH 06/66] storage: Run docker image prune on cleanup. #TASK-6722 --- .../variant/mr/StreamVariantMapper.java | 51 ++++++++++++++++++- .../VariantHadoopStoragePipelineTest.java | 2 +- 2 files changed, 50 insertions(+), 3 deletions(-) diff --git a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/mr/StreamVariantMapper.java b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/mr/StreamVariantMapper.java index 15233484298..e02ae93c489 100644 --- a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/mr/StreamVariantMapper.java +++ b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/mr/StreamVariantMapper.java @@ -1,5 +1,7 @@ package org.opencb.opencga.storage.hadoop.variant.mr; +import com.fasterxml.jackson.databind.MapperFeature; +import com.fasterxml.jackson.databind.ObjectMapper; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.hadoop.conf.Configuration; @@ -10,9 +12,13 @@ import org.apache.hadoop.mapreduce.Mapper; import org.apache.hadoop.util.LineReader; import org.opencb.biodata.models.variant.Variant; +import org.opencb.biodata.models.variant.metadata.VariantMetadata; import org.opencb.commons.datastore.core.Query; import org.opencb.commons.datastore.core.QueryOptions; +import org.opencb.commons.exec.Command; import org.opencb.commons.io.DataWriter; +import org.opencb.opencga.storage.core.exceptions.StorageEngineException; +import org.opencb.opencga.storage.core.metadata.VariantMetadataFactory; import org.opencb.opencga.storage.core.metadata.VariantStorageMetadataManager; import org.opencb.opencga.storage.core.variant.io.VariantWriterFactory; import org.opencb.opencga.storage.hadoop.variant.metadata.HBaseVariantStorageMetadataDBAdaptorFactory; @@ -62,6 +68,7 @@ public class StreamVariantMapper extends VariantMapper throwables = Collections.synchronizedList(new ArrayList<>()); private volatile boolean processProvidedStatus_ = false; + private VariantMetadata metadata; public static void setCommandLine(Job job, String commandLine) { String commandLineBase64 = Base64.getEncoder().encodeToString(commandLine.getBytes()); @@ -125,7 +132,16 @@ public void run(Context context) throws IOException, InterruptedException { } else { keyStr = currentKey.toString(); } - addException("Exception in mapper for key: " + 
keyStr, th); + String message = "Exception in mapper for key: '" + keyStr + "'"; + try { + Variant currentValue = context.getCurrentValue(); + if (currentValue != null) { + message += " value: '" + currentValue + "'"; + } + } catch (Throwable t) { + th.addSuppressed(t); + } + addException(message, th); } else { addException(th); } @@ -210,9 +226,30 @@ private void throwExceptionIfAny() throws IOException { @Override protected void cleanup(Mapper.Context context) throws IOException, InterruptedException { closeProcess(context); + dockerPruneImages(); super.cleanup(context); } + private void dockerPruneImages() { + try { + LOG.info("Pruning docker images"); + int maxImages = 5; + Command command = new Command(new String[]{"bash", "-c", "[ $(docker image ls --format json | wc -l) -gt " + maxImages + " ] " + + "&& echo 'Run docker image prune' && docker image prune -f -a " + + "|| echo 'Skipping docker image prune. Less than " + maxImages + " images.'"}, Collections.emptyMap()); + command.run(); + int ecode = command.getExitValue(); + + // Throw exception if the process failed + if (ecode != 0) { + throw new IOException("Error executing 'docker image prune -f -a'. Exit code: " + ecode); + } + LOG.info("Docker images pruned"); + } catch (IOException e) { + addException(e); + } + } + @Override protected void map(Object key, Variant value, Context context) throws IOException, InterruptedException { numRecordsRead++; @@ -270,7 +307,7 @@ private void closeProcess(Context context) throws IOException, InterruptedExcept // drainStdout(context); } - private void startProcess(Context context) throws IOException { + private void startProcess(Context context) throws IOException, StorageEngineException { LOG.info("bash -ce '" + commandLine + "'"); context.getCounter(COUNTER_GROUP_NAME, "START_PROCESS").increment(1); @@ -298,6 +335,16 @@ private void startProcess(Context context) throws IOException { variantDataWriter = writerFactory.newDataWriter(format, stdin, new Query(query), new QueryOptions(options)); + + if (format.inPlain() == VariantWriterFactory.VariantOutputFormat.JSON) { + if (metadata == null) { + VariantMetadataFactory metadataFactory = new VariantMetadataFactory(metadataManager); + metadata = metadataFactory.makeVariantMetadata(query, options); + } + ObjectMapper objectMapper = new ObjectMapper().configure(MapperFeature.REQUIRE_SETTERS_FOR_GETTERS, true); + objectMapper.writeValue((DataOutput) stdin, metadata); + } + processedBytes = 0; numRecordsRead = 0; numRecordsWritten = 0; diff --git a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/test/java/org/opencb/opencga/storage/hadoop/variant/VariantHadoopStoragePipelineTest.java b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/test/java/org/opencb/opencga/storage/hadoop/variant/VariantHadoopStoragePipelineTest.java index 33b67fb5b12..ac01326b1cc 100644 --- a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/test/java/org/opencb/opencga/storage/hadoop/variant/VariantHadoopStoragePipelineTest.java +++ b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/test/java/org/opencb/opencga/storage/hadoop/variant/VariantHadoopStoragePipelineTest.java @@ -329,7 +329,7 @@ public void exportCommand() throws Exception { // variantStorageEngine.walkData(outdir.resolve("variant6.txt.gz"), VariantWriterFactory.VariantOutputFormat.VCF, new Query(), new QueryOptions(), cmdPython); // variantStorageEngine.walkData(outdir.resolve("variant4.txt.gz"), 
VariantWriterFactory.VariantOutputFormat.JSON, new Query(), new QueryOptions(), "opencb/opencga-base", cmd); // variantStorageEngine.walkData(outdir.resolve("variant4.txt.gz"), VariantWriterFactory.VariantOutputFormat.JSON, new Query(), new QueryOptions(), "opencb/opencga-base", cmdPython1); - variantStorageEngine.walkData(outdir.resolve("variant4.txt.gz"), VariantWriterFactory.VariantOutputFormat.JSON, new Query(), new QueryOptions(), "my-python-app:latest", cmdPython1); + variantStorageEngine.walkData(outdir.resolve("variant4.txt.gz"), VariantWriterFactory.VariantOutputFormat.JSON, new Query(), new QueryOptions(), "jcoll/my-python-app:latest", cmdPython1); } From c5375ea45b33154e630180ff9ec38625e93971e2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jacobo=20Coll=20Morag=C3=B3n?= Date: Thu, 24 Oct 2024 18:58:59 +0100 Subject: [PATCH 07/66] storage: Ensure walker output is sorted. #TASK-6722 --- .../storage/hadoop/utils/HBaseManager.java | 4 +-- .../hadoop/variant/io/VariantDriver.java | 6 ++-- .../variant/mr/StreamVariantDriver.java | 10 +++--- .../variant/mr/StreamVariantMapper.java | 24 +++++++++++--- .../variant/mr/StreamVariantReducer.java | 33 +++++++++++++++++++ 5 files changed, 65 insertions(+), 12 deletions(-) create mode 100644 opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/mr/StreamVariantReducer.java diff --git a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/utils/HBaseManager.java b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/utils/HBaseManager.java index 1f6cd77efd8..2074a30f89f 100644 --- a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/utils/HBaseManager.java +++ b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/utils/HBaseManager.java @@ -410,8 +410,8 @@ public boolean splitAndMove(Admin admin, TableName tableName, byte[] expectedSpl LOGGER.info("Splitting table '{}' at '{}'", tableName, Bytes.toStringBinary(expectedSplit)); admin.split(tableName, expectedSplit); regionInfo = getRegionInfo(admin, tableName, expectedSplit); - int getRegionInfoAttempts = 10; - while (regionInfo == null) { + int getRegionInfoAttempts = 20; + while (regionInfo == null || regionInfo.isOffline()) { try { Thread.sleep(200); } catch (InterruptedException e) { diff --git a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/io/VariantDriver.java b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/io/VariantDriver.java index 755275263a7..10968dd2110 100644 --- a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/io/VariantDriver.java +++ b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/io/VariantDriver.java @@ -4,6 +4,7 @@ import org.apache.hadoop.fs.Path; import org.apache.hadoop.hbase.client.Scan; import org.apache.hadoop.mapreduce.Job; +import org.apache.hadoop.mapreduce.OutputFormat; import org.apache.hadoop.mapreduce.Reducer; import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; import org.opencb.commons.datastore.core.Query; @@ -92,7 +93,7 @@ protected void 
parseAndValidateParameters() throws IOException { protected abstract Class getReducerClass(); - protected abstract Class getOutputFormatClass(); + protected abstract Class getOutputFormatClass(); protected abstract void setupJob(Job job) throws IOException; @@ -109,7 +110,7 @@ protected final Job setupJob(Job job, String archiveTable, String variantTable) throw new IllegalArgumentException("Reducer class not provided!"); } } - Class outputFormatClass = getOutputFormatClass(); + Class outputFormatClass = getOutputFormatClass(); if (outputFormatClass == null) { throw new IllegalArgumentException("Output format class not provided!"); } @@ -118,6 +119,7 @@ protected final Job setupJob(Job job, String archiveTable, String variantTable) if (useReduceStep) { logger.info("Use one Reduce task to produce a single file"); job.setReducerClass(reducerClass); + // TODO: Configure multiple reducers and partitioner job.setNumReduceTasks(1); } else { VariantMapReduceUtil.setNoneReduce(job); diff --git a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/mr/StreamVariantDriver.java b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/mr/StreamVariantDriver.java index 5a248e190e1..d1c2e73ad4d 100644 --- a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/mr/StreamVariantDriver.java +++ b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/mr/StreamVariantDriver.java @@ -8,8 +8,10 @@ import org.apache.hadoop.io.compress.GzipCodec; import org.apache.hadoop.mapred.JobContext; import org.apache.hadoop.mapreduce.Job; +import org.apache.hadoop.mapreduce.OutputFormat; import org.apache.hadoop.mapreduce.Reducer; -import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; +import org.apache.hadoop.mapreduce.lib.output.LazyOutputFormat; +import org.apache.hadoop.mapreduce.lib.output.MultipleOutputs; import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat; import org.apache.hadoop.util.Tool; import org.opencb.opencga.storage.core.variant.io.VariantWriterFactory; @@ -39,7 +41,7 @@ public class StreamVariantDriver extends VariantDriver { private Class mapperClass; private Class reducerClass; - private Class outputFormatClass; + private Class outputFormatClass; @Override protected Map getParams() { @@ -110,7 +112,7 @@ protected Class getReducerClass() { } @Override - protected Class getOutputFormatClass() { + protected Class getOutputFormatClass() { return outputFormatClass; } @@ -134,7 +136,7 @@ protected void setupJob(Job job) throws IOException { StreamVariantMapper.setMaxInputBytesPerProcess(job, maxBytesPerMap); StreamVariantMapper.setEnvironment(job, envVars); - reducerClass = Reducer.class; + reducerClass = StreamVariantReducer.class; outputFormatClass = ValueOnlyTextOutputFormat.class; job.setOutputFormatClass(ValueOnlyTextOutputFormat.class); diff --git a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/mr/StreamVariantMapper.java b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/mr/StreamVariantMapper.java index e02ae93c489..09845a8a617 100644 --- 
a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/mr/StreamVariantMapper.java +++ b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/mr/StreamVariantMapper.java @@ -53,6 +53,7 @@ public class StreamVariantMapper extends VariantMapper 01 + // 3 -> 03 + // 22 -> 22 + // If the first character is a digit, and the second is not, add a 0 at the beginning + // MT -> MT + // 1_KI270712v1_random -> 01_KI270712v1_random + if (Character.isDigit(chromosome.charAt(0)) && (chromosome.length() == 1 || !Character.isDigit(chromosome.charAt(1)))) { + chromosome = "0" + chromosome; + } + + outputKeyPrefix = String.format("%s|%010d|", chromosome, variant.getStart()); + outputKeyNum = 0; } @Override public void run(Context context) throws IOException, InterruptedException { if (context.nextKeyValue()) { + Variant currentValue = null; try { setup(context); startProcess(context); @@ -121,7 +137,8 @@ public void run(Context context) throws IOException, InterruptedException { closeProcess(context); startProcess(context); } - map(context.getCurrentKey(), context.getCurrentValue(), context); + currentValue = context.getCurrentValue(); + map(context.getCurrentKey(), currentValue, context); } while (!hasExceptions() && context.nextKeyValue()); } catch (Throwable th) { Object currentKey = context.getCurrentKey(); @@ -134,7 +151,6 @@ public void run(Context context) throws IOException, InterruptedException { } String message = "Exception in mapper for key: '" + keyStr + "'"; try { - Variant currentValue = context.getCurrentValue(); if (currentValue != null) { message += " value: '" + currentValue + "'"; } @@ -410,7 +426,7 @@ public void run() { LineReader stdoutLineReader = new LineReader(stdout); try { while (stdoutLineReader.readLine(line) > 0) { - context.write(new ImmutableBytesWritable(Bytes.toBytes(outputKeyNum++)), line); + context.write(new ImmutableBytesWritable(Bytes.toBytes(outputKeyPrefix + (outputKeyNum++))), line); // context.write(null, line); if (verboseStdout) { LOG.info("[STDOUT] - " + line); diff --git a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/mr/StreamVariantReducer.java b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/mr/StreamVariantReducer.java new file mode 100644 index 00000000000..dac1b5dae4e --- /dev/null +++ b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/mr/StreamVariantReducer.java @@ -0,0 +1,33 @@ +package org.opencb.opencga.storage.hadoop.variant.mr; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.hbase.io.ImmutableBytesWritable; +import org.apache.hadoop.io.Text; +import org.apache.hadoop.mapreduce.Reducer; + +import java.io.IOException; + +public class StreamVariantReducer extends Reducer { + private static final Log LOG = LogFactory.getLog(StreamVariantReducer.class); + + @Override + protected void setup(Reducer.Context context) throws IOException, InterruptedException { + super.setup(context); + } + + @Override + protected void reduce(ImmutableBytesWritable key, Iterable values, Reducer.Context context) throws IOException, InterruptedException { + + for (Text value : values) { + context.write(key, value); + 
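+ // With a single reduce task, rows arrive sorted by the map output key ("<chromosome>|<start>|<n>", e.g. "01|0000000100|0" as built in StreamVariantMapper), so writing values in iteration order preserves the genomic order of the walker output.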
context.getCounter(VariantsTableMapReduceHelper.COUNTER_GROUP_NAME, "stdout_records").increment(1); + } + + } + + @Override + protected void cleanup(Reducer.Context context) throws IOException, InterruptedException { + super.cleanup(context); + } +} From 663c03ae91df3276816e253ad39dd8d5af6bda20 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jacobo=20Coll=20Morag=C3=B3n?= Date: Fri, 25 Oct 2024 10:09:19 +0100 Subject: [PATCH 08/66] storage: Extract walker STDERR file from MR execution. #TASK-6722 --- .../hadoop/utils/AbstractHBaseDriver.java | 28 +++++----- .../VariantTableAggregationDriver.java | 2 +- .../variant/executors/SshMRExecutor.java | 56 +++++++++++-------- .../hadoop/variant/io/VariantDriver.java | 20 ++++--- .../variant/mr/StreamVariantDriver.java | 16 +++++- .../variant/mr/StreamVariantMapper.java | 52 +++++++++++++++-- .../variant/mr/StreamVariantReducer.java | 26 +++++++-- .../variant/mr/StreamVariantMapperTest.java | 41 ++++++++++++++ 8 files changed, 183 insertions(+), 58 deletions(-) create mode 100644 opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/test/java/org/opencb/opencga/storage/hadoop/variant/mr/StreamVariantMapperTest.java diff --git a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/utils/AbstractHBaseDriver.java b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/utils/AbstractHBaseDriver.java index 4a15198f102..3773141eefd 100644 --- a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/utils/AbstractHBaseDriver.java +++ b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/utils/AbstractHBaseDriver.java @@ -435,7 +435,7 @@ public Path getOutdir() { * @return List of copied files from HDFS */ protected List concatMrOutputToLocal(Path mrOutdir, Path localOutput) throws IOException { - return concatMrOutputToLocal(mrOutdir, localOutput, true); + return concatMrOutputToLocal(mrOutdir, localOutput, true, null); } /** @@ -444,10 +444,12 @@ protected List concatMrOutputToLocal(Path mrOutdir, Path localOutput) thro * @param mrOutdir MapReduce output directory * @param localOutput Local file * @param removeExtraHeaders Remove header lines starting with "#" from all files but the first + * @param partFilePrefix Filter partial files with specific prefix. Otherwise, concat them all. * @throws IOException on IOException * @return List of copied files from HDFS */ - protected List concatMrOutputToLocal(Path mrOutdir, Path localOutput, boolean removeExtraHeaders) throws IOException { + protected List concatMrOutputToLocal(Path mrOutdir, Path localOutput, boolean removeExtraHeaders, String partFilePrefix) + throws IOException { // TODO: Allow copy output to any IOConnector FileSystem fileSystem = mrOutdir.getFileSystem(getConf()); RemoteIterator it = fileSystem.listFiles(mrOutdir, false); @@ -461,10 +463,12 @@ protected List concatMrOutputToLocal(Path mrOutdir, Path localOutput, bool && !path.getName().equals(ParquetFileWriter.PARQUET_METADATA_FILE) && !path.getName().equals(ParquetFileWriter.PARQUET_COMMON_METADATA_FILE) && status.getLen() > 0) { - paths.add(path); + if (partFilePrefix == null || path.getName().startsWith(partFilePrefix)) { + paths.add(path); + } } } - if (paths.size() == 0) { + if (paths.isEmpty()) { LOGGER.warn("The MapReduce job didn't produce any output. 
This may not be expected."); } else if (paths.size() == 1) { LOGGER.info("Copy to local file " + paths.get(0).toUri() + " to " + localOutput.toUri()); @@ -475,17 +479,15 @@ protected List concatMrOutputToLocal(Path mrOutdir, Path localOutput, bool LOGGER.info(" Source : " + mrOutdir.toUri()); LOGGER.info(" Target : " + localOutput.toUri()); LOGGER.info(" ---- "); - try (FSDataOutputStream fsOs = localOutput.getFileSystem(getConf()).create(localOutput)) { - boolean isGzip = paths.get(0).getName().endsWith(".gz"); - OutputStream os; - if (isGzip) { - os = new GZIPOutputStream(fsOs); - } else { - os = fsOs; - } + boolean isGzip = paths.get(0).getName().endsWith(".gz"); + try (FSDataOutputStream fsOs = localOutput.getFileSystem(getConf()).create(localOutput); + OutputStream gzOs = isGzip ? new GZIPOutputStream(fsOs) : null) { + OutputStream os = gzOs == null ? fsOs : gzOs; for (int i = 0; i < paths.size(); i++) { Path path = paths.get(i); - LOGGER.info("Concat file : '{}' {} ", path.toUri(), + LOGGER.info("Concat {}file : '{}' {} ", + isGzip ? "gzip " : "", + path.toUri(), humanReadableByteCount(fileSystem.getFileStatus(path).getLen(), false)); try (FSDataInputStream fsIs = fileSystem.open(path)) { InputStream is; diff --git a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/VariantTableAggregationDriver.java b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/VariantTableAggregationDriver.java index 374b5526e89..0d471c8387d 100644 --- a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/VariantTableAggregationDriver.java +++ b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/VariantTableAggregationDriver.java @@ -144,7 +144,7 @@ protected void postExecution(boolean succeed) throws IOException, StorageEngineE super.postExecution(succeed); if (succeed) { if (localOutput != null) { - concatMrOutputToLocal(outdir, localOutput, isOutputWithHeaders()); + concatMrOutputToLocal(outdir, localOutput, isOutputWithHeaders(), null); } } if (localOutput != null) { diff --git a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/executors/SshMRExecutor.java b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/executors/SshMRExecutor.java index b205511f830..612f3183a98 100644 --- a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/executors/SshMRExecutor.java +++ b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/executors/SshMRExecutor.java @@ -106,6 +106,11 @@ public Result run(String executable, String[] args) throws StorageEngineExceptio ObjectMap result = readResult(new String(outputStream.toByteArray(), Charset.defaultCharset())); if (exitValue == 0) { copyOutputFiles(args, env); + for (String key : result.keySet()) { + if (key.startsWith("EXTRA_OUTPUT_")) { + copyOutputFiles(result.getString(key), env); + } + } } return new Result(exitValue, result); } @@ -125,33 +130,38 @@ private Path copyOutputFiles(String[] args, List env) throws StorageEngi List argsList = Arrays.asList(args); int outputIdx = argsList.indexOf("output"); if 
(outputIdx > 0 && argsList.size() > outputIdx + 1) { - String targetOutput = UriUtils.createUriSafe(argsList.get(outputIdx + 1)).getPath(); - if (StringUtils.isNotEmpty(targetOutput)) { - String remoteOpencgaHome = getOptions().getString(MR_EXECUTOR_SSH_REMOTE_OPENCGA_HOME.key()); - String srcOutput; - if (StringUtils.isNoneEmpty(remoteOpencgaHome, getOpencgaHome())) { - srcOutput = targetOutput.replaceAll(getOpencgaHome(), remoteOpencgaHome); - } else { - srcOutput = targetOutput; - } + return copyOutputFiles(argsList.get(outputIdx + 1), env); + } + // Nothing to copy + return null; + } - String hadoopScpBin = getOptions() - .getString(MR_EXECUTOR_SSH_HADOOP_SCP_BIN.key(), MR_EXECUTOR_SSH_HADOOP_SCP_BIN.defaultValue()); - String commandLine = getBinPath(hadoopScpBin) + " " + srcOutput + " " + targetOutput; + private Path copyOutputFiles(String output, List env) throws StorageEngineException { + String targetOutput = UriUtils.createUriSafe(output).getPath(); + if (StringUtils.isNotEmpty(targetOutput)) { + String remoteOpencgaHome = getOptions().getString(MR_EXECUTOR_SSH_REMOTE_OPENCGA_HOME.key()); + String srcOutput; + if (StringUtils.isNoneEmpty(remoteOpencgaHome, getOpencgaHome())) { + srcOutput = targetOutput.replaceAll(getOpencgaHome(), remoteOpencgaHome); + } else { + srcOutput = targetOutput; + } - Command command = new Command(commandLine, env); - command.run(); - int exitValue = command.getExitValue(); - if (exitValue != 0) { - String sshHost = getOptions().getString(MR_EXECUTOR_SSH_HOST.key()); - String sshUser = getOptions().getString(MR_EXECUTOR_SSH_USER.key()); - throw new StorageEngineException("There was an issue copying files from " - + sshUser + "@" + sshHost + ":" + srcOutput + " to " + targetOutput); - } - return Paths.get(targetOutput); + String hadoopScpBin = getOptions() + .getString(MR_EXECUTOR_SSH_HADOOP_SCP_BIN.key(), MR_EXECUTOR_SSH_HADOOP_SCP_BIN.defaultValue()); + String commandLine = getBinPath(hadoopScpBin) + " " + srcOutput + " " + targetOutput; + + Command command = new Command(commandLine, env); + command.run(); + int exitValue = command.getExitValue(); + if (exitValue != 0) { + String sshHost = getOptions().getString(MR_EXECUTOR_SSH_HOST.key()); + String sshUser = getOptions().getString(MR_EXECUTOR_SSH_USER.key()); + throw new StorageEngineException("There was an issue copying files from " + + sshUser + "@" + sshHost + ":" + srcOutput + " to " + targetOutput); } + return Paths.get(targetOutput); } - // Nothing to copy return null; } diff --git a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/io/VariantDriver.java b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/io/VariantDriver.java index 10968dd2110..40d178384a0 100644 --- a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/io/VariantDriver.java +++ b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/io/VariantDriver.java @@ -48,10 +48,10 @@ public abstract class VariantDriver extends AbstractVariantsTableDriver { public static final String OUTPUT_PARAM = "output"; public static final String CONCAT_OUTPUT_PARAM = "concat-output"; - private Path outdir; - private Path localOutput; - private Query query = new Query(); - private QueryOptions options = new QueryOptions(); + protected Path outdir; + protected Path 
localOutput; + private final Query query = new Query(); + private final QueryOptions options = new QueryOptions(); private static Logger logger = LoggerFactory.getLogger(VariantDriver.class); protected boolean useReduceStep; @@ -166,14 +166,16 @@ protected final Job setupJob(Job job, String archiveTable, String variantTable) @Override protected void postExecution(boolean succeed) throws IOException, StorageEngineException { super.postExecution(succeed); - if (succeed) { - if (localOutput != null) { - concatMrOutputToLocal(outdir, localOutput); - } - } if (localOutput != null) { + if (succeed) { + copyMrOutputToLocal(); + } deleteTemporaryFile(outdir); } } + protected void copyMrOutputToLocal() throws IOException { + concatMrOutputToLocal(outdir, localOutput, true, null); + } + } diff --git a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/mr/StreamVariantDriver.java b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/mr/StreamVariantDriver.java index d1c2e73ad4d..9314c1e9818 100644 --- a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/mr/StreamVariantDriver.java +++ b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/mr/StreamVariantDriver.java @@ -1,6 +1,7 @@ package org.opencb.opencga.storage.hadoop.variant.mr; import org.apache.commons.lang3.StringUtils; +import org.apache.hadoop.fs.Path; import org.apache.hadoop.hbase.io.ImmutableBytesWritable; import org.apache.hadoop.io.Text; import org.apache.hadoop.io.compress.CompressionCodec; @@ -138,7 +139,11 @@ protected void setupJob(Job job) throws IOException { reducerClass = StreamVariantReducer.class; - outputFormatClass = ValueOnlyTextOutputFormat.class; + MultipleOutputs.addNamedOutput(job, "stdout", ValueOnlyTextOutputFormat.class, keyClass, valueClass); + MultipleOutputs.addNamedOutput(job, "stderr", ValueOnlyTextOutputFormat.class, keyClass, valueClass); + LazyOutputFormat.setOutputFormatClass(job, TextOutputFormat.class); + outputFormatClass = LazyOutputFormat.class; + job.setOutputFormatClass(ValueOnlyTextOutputFormat.class); TextOutputFormat.setCompressOutput(job, true); TextOutputFormat.setOutputCompressorClass(job, GzipCodec.class); @@ -152,6 +157,15 @@ protected String getJobOperationName() { return "stream-variants"; } + + @Override + protected void copyMrOutputToLocal() throws IOException { + concatMrOutputToLocal(outdir, localOutput, true, "stdout"); + Path stderrOutput = localOutput.suffix(".stderr.txt.gz"); + concatMrOutputToLocal(outdir, stderrOutput, true, "stderr"); + printKeyValue("EXTRA_OUTPUT_STDERR", stderrOutput); + } + @SuppressWarnings("unchecked") public static void main(String[] args) { main(args, (Class) MethodHandles.lookup().lookupClass()); diff --git a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/mr/StreamVariantMapper.java b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/mr/StreamVariantMapper.java index 09845a8a617..58f3a87188a 100644 --- a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/mr/StreamVariantMapper.java +++ 
b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/mr/StreamVariantMapper.java @@ -2,6 +2,7 @@ import com.fasterxml.jackson.databind.MapperFeature; import com.fasterxml.jackson.databind.ObjectMapper; +import org.apache.commons.lang3.time.StopWatch; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.hadoop.conf.Configuration; @@ -17,6 +18,7 @@ import org.opencb.commons.datastore.core.QueryOptions; import org.opencb.commons.exec.Command; import org.opencb.commons.io.DataWriter; +import org.opencb.opencga.core.common.TimeUtils; import org.opencb.opencga.storage.core.exceptions.StorageEngineException; import org.opencb.opencga.storage.core.metadata.VariantMetadataFactory; import org.opencb.opencga.storage.core.metadata.VariantStorageMetadataManager; @@ -52,9 +54,13 @@ public class StreamVariantMapper extends VariantMapper 01 // 3 -> 03 @@ -118,8 +130,7 @@ protected void setup(Context context) throws IOException, InterruptedException { chromosome = "0" + chromosome; } - outputKeyPrefix = String.format("%s|%010d|", chromosome, variant.getStart()); - outputKeyNum = 0; + return String.format("%s|%010d|", chromosome, start); } @Override @@ -334,6 +345,7 @@ private void startProcess(Context context) throws IOException, StorageEngineExce // envs.forEach((k, v) -> LOG.info("Config ENV: " + k + "=" + v)); builder.environment().putAll(envs); process = builder.start(); + processCount++; stdin = new DataOutputStream(new BufferedOutputStream( process.getOutputStream(), @@ -415,6 +427,7 @@ private class MROutputThread extends Thread { private final Mapper.Context context; private long lastStdoutReport = 0; + private int numRecords = 0; MROutputThread(Context context) { this.context = context; @@ -426,8 +439,7 @@ public void run() { LineReader stdoutLineReader = new LineReader(stdout); try { while (stdoutLineReader.readLine(line) > 0) { - context.write(new ImmutableBytesWritable(Bytes.toBytes(outputKeyPrefix + (outputKeyNum++))), line); -// context.write(null, line); + write(line); if (verboseStdout) { LOG.info("[STDOUT] - " + line); } @@ -448,6 +460,12 @@ public void run() { addException(th); } } + + private void write(Text line) throws IOException, InterruptedException { + numRecords++; + context.write(new ImmutableBytesWritable( + Bytes.toBytes(StreamVariantReducer.STDOUT_KEY + outputKeyPrefix + (stdoutKeyNum++))), line); + } } private class MRErrorThread extends Thread { @@ -475,6 +493,13 @@ public void run() { Text line = new Text(); LineReader stderrLineReader = new LineReader(stderr); try { + StopWatch stopWatch = StopWatch.createStarted(); + write("---------- " + context.getTaskAttemptID().toString() + " -----------"); + write("Start time : " + TimeUtils.getTimeMillis()); + write("Batch start : " + firstKey + " -> " + outputKeyPrefix); + write("sub-process #" + processCount); + write("--- START STDERR ---"); + int numRecords = 0; while (stderrLineReader.readLine(line) > 0) { String lineStr = line.toString(); if (matchesReporter(lineStr)) { @@ -493,6 +518,8 @@ public void run() { while (stderrBufferSize > STDERR_BUFFER_CAPACITY && stderrBuffer.size() > 3) { stderrBufferSize -= stderrBuffer.remove().length(); } + write(line); + numRecords++; LOG.info("[STDERR] - " + lineStr); } long now = System.currentTimeMillis(); @@ -502,11 +529,24 @@ public void run() { } line.clear(); } + write("--- END STDERR ---"); + write("Execution time : " + TimeUtils.durationToString(stopWatch)); + 
write("STDOUT lines : " + stdoutThread.numRecords); + write("STDERR lines : " + numRecords); } catch (Throwable th) { addException(th); } } + private void write(String line) throws IOException, InterruptedException { + write(new Text(line)); + } + + private void write(Text line) throws IOException, InterruptedException { + context.write(new ImmutableBytesWritable( + Bytes.toBytes(StreamVariantReducer.STDERR_KEY + outputKeyPrefix + (stderrKeyNum++))), line); + } + private boolean matchesReporter(String line) { return line.startsWith(reporterPrefix); } diff --git a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/mr/StreamVariantReducer.java b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/mr/StreamVariantReducer.java index dac1b5dae4e..cfe798a5b81 100644 --- a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/mr/StreamVariantReducer.java +++ b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/mr/StreamVariantReducer.java @@ -3,31 +3,47 @@ import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.hadoop.hbase.io.ImmutableBytesWritable; +import org.apache.hadoop.hbase.util.Bytes; import org.apache.hadoop.io.Text; import org.apache.hadoop.mapreduce.Reducer; +import org.apache.hadoop.mapreduce.lib.output.MultipleOutputs; import java.io.IOException; public class StreamVariantReducer extends Reducer { + + public static final String STDOUT_KEY = "O:"; + public static final byte[] STDOUT_KEY_BYTES = Bytes.toBytes(STDOUT_KEY); + public static final String STDERR_KEY = "E:"; + public static final byte[] STDERR_KEY_BYTES = Bytes.toBytes(STDERR_KEY); + private static final Log LOG = LogFactory.getLog(StreamVariantReducer.class); + private MultipleOutputs mos; @Override protected void setup(Reducer.Context context) throws IOException, InterruptedException { super.setup(context); + mos = new MultipleOutputs<>(context); } @Override - protected void reduce(ImmutableBytesWritable key, Iterable values, Reducer.Context context) throws IOException, InterruptedException { - + protected void reduce(ImmutableBytesWritable key, Iterable values, Reducer.Context context) + throws IOException, InterruptedException { for (Text value : values) { - context.write(key, value); - context.getCounter(VariantsTableMapReduceHelper.COUNTER_GROUP_NAME, "stdout_records").increment(1); + if (Bytes.equals(key.get(), key.getOffset(), STDOUT_KEY_BYTES.length, STDOUT_KEY_BYTES, 0, STDOUT_KEY_BYTES.length)) { + mos.write("stdout", key, value); + context.getCounter(VariantsTableMapReduceHelper.COUNTER_GROUP_NAME, "stdout_records").increment(1); + } else { + mos.write("stderr", key, value); + context.getCounter(VariantsTableMapReduceHelper.COUNTER_GROUP_NAME, "stderr_records").increment(1); + } + context.getCounter(VariantsTableMapReduceHelper.COUNTER_GROUP_NAME, "records").increment(1); } - } @Override protected void cleanup(Reducer.Context context) throws IOException, InterruptedException { super.cleanup(context); + mos.close(); } } diff --git a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/test/java/org/opencb/opencga/storage/hadoop/variant/mr/StreamVariantMapperTest.java 
b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/test/java/org/opencb/opencga/storage/hadoop/variant/mr/StreamVariantMapperTest.java new file mode 100644 index 00000000000..690a16df5f8 --- /dev/null +++ b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/test/java/org/opencb/opencga/storage/hadoop/variant/mr/StreamVariantMapperTest.java @@ -0,0 +1,41 @@ +package org.opencb.opencga.storage.hadoop.variant.mr; + +import org.junit.Test; +import org.junit.experimental.categories.Category; +import org.opencb.opencga.core.testclassification.duration.ShortTests; + +import static org.junit.Assert.*; + + +@Category(ShortTests.class) +public class StreamVariantMapperTest { + @Test + public void buildOutputKeyPrefixSingleDigitChromosome() { + String result = StreamVariantMapper.buildOutputKeyPrefix("1", 100); + assertEquals("01|0000000100|", result); + } + + @Test + public void buildOutputKeyPrefixDoubleDigitChromosome() { + String result = StreamVariantMapper.buildOutputKeyPrefix("22", 100); + assertEquals("22|0000000100|", result); + } + + @Test + public void buildOutputKeyPrefixRandomChromosome() { + String result = StreamVariantMapper.buildOutputKeyPrefix("1_KI270712v1_random", 100); + assertEquals("01_KI270712v1_random|0000000100|", result); + } + + @Test + public void buildOutputKeyPrefixMTChromosome() { + String result = StreamVariantMapper.buildOutputKeyPrefix("MT", 100); + assertEquals("MT|0000000100|", result); + } + + @Test + public void buildOutputKeyPrefixXChromosome() { + String result = StreamVariantMapper.buildOutputKeyPrefix("X", 100); + assertEquals("X|0000000100|", result); + } +} \ No newline at end of file From 154befa4baec3f7bba3773eb7ec7b6c1b5f7f438 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jacobo=20Coll=20Morag=C3=B3n?= Date: Fri, 25 Oct 2024 10:21:57 +0100 Subject: [PATCH 09/66] storage: Do not write multiple headers. 
#TASK-6722 --- .../variant/mr/StreamVariantReducer.java | 43 ++++++++++++++++--- 1 file changed, 38 insertions(+), 5 deletions(-) diff --git a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/mr/StreamVariantReducer.java b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/mr/StreamVariantReducer.java index cfe798a5b81..3a52bfbfbcb 100644 --- a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/mr/StreamVariantReducer.java +++ b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/mr/StreamVariantReducer.java @@ -18,20 +18,37 @@ public class StreamVariantReducer extends Reducer mos; + private boolean headerWritten = false; @Override - protected void setup(Reducer.Context context) throws IOException, InterruptedException { + protected void setup(Reducer.Context context) + throws IOException, InterruptedException { super.setup(context); mos = new MultipleOutputs<>(context); } @Override - protected void reduce(ImmutableBytesWritable key, Iterable values, Reducer.Context context) + protected void reduce(ImmutableBytesWritable key, Iterable values, + Reducer.Context context) throws IOException, InterruptedException { for (Text value : values) { - if (Bytes.equals(key.get(), key.getOffset(), STDOUT_KEY_BYTES.length, STDOUT_KEY_BYTES, 0, STDOUT_KEY_BYTES.length)) { - mos.write("stdout", key, value); + if (hasPrefix(key, STDOUT_KEY_BYTES)) { + if (hasPrefix(value, HEADER_PREFIX_BYTES)) { + if (headerWritten) { + // skip header + context.getCounter(VariantsTableMapReduceHelper.COUNTER_GROUP_NAME, "header_records_skip").increment(1); + } else { + mos.write("stdout", key, value); + context.getCounter(VariantsTableMapReduceHelper.COUNTER_GROUP_NAME, "header_records").increment(1); + } + } else { + // No more header, assume all header is written + headerWritten = true; + mos.write("stdout", key, value); + context.getCounter(VariantsTableMapReduceHelper.COUNTER_GROUP_NAME, "body_records").increment(1); + } context.getCounter(VariantsTableMapReduceHelper.COUNTER_GROUP_NAME, "stdout_records").increment(1); } else { mos.write("stderr", key, value); @@ -41,8 +58,24 @@ protected void reduce(ImmutableBytesWritable key, Iterable values, Reducer } } + private static boolean hasPrefix(ImmutableBytesWritable key, byte[] prefix) { + return hasPrefix(key.get(), key.getOffset(), key.getLength(), prefix); + } + + private static boolean hasPrefix(Text text, byte[] prefix) { + return hasPrefix(text.getBytes(), 0, text.getLength(), prefix); + } + + private static boolean hasPrefix(byte[] key, int offset, int length, byte[] prefix) { + if (length < prefix.length) { + return false; + } + return Bytes.equals(key, offset, prefix.length, prefix, 0, prefix.length); + } + @Override - protected void cleanup(Reducer.Context context) throws IOException, InterruptedException { + protected void cleanup(Reducer.Context context) + throws IOException, InterruptedException { super.cleanup(context); mos.close(); } From 85aac6d78d03ef2e4036f53d1fddb6b48185bdaa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jacobo=20Coll=20Morag=C3=B3n?= Date: Fri, 25 Oct 2024 14:15:28 +0100 Subject: [PATCH 10/66] storage: Fix NoSuchMethodError creating StopWatch. 
#TASK-6722 --- .../storage/hadoop/variant/mr/StreamVariantMapper.java | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/mr/StreamVariantMapper.java b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/mr/StreamVariantMapper.java index 58f3a87188a..626a017512f 100644 --- a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/mr/StreamVariantMapper.java +++ b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/mr/StreamVariantMapper.java @@ -2,7 +2,6 @@ import com.fasterxml.jackson.databind.MapperFeature; import com.fasterxml.jackson.databind.ObjectMapper; -import org.apache.commons.lang3.time.StopWatch; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.hadoop.conf.Configuration; @@ -12,6 +11,7 @@ import org.apache.hadoop.mapreduce.Job; import org.apache.hadoop.mapreduce.Mapper; import org.apache.hadoop.util.LineReader; +import org.apache.hadoop.util.StopWatch; import org.opencb.biodata.models.variant.Variant; import org.opencb.biodata.models.variant.metadata.VariantMetadata; import org.opencb.commons.datastore.core.Query; @@ -27,6 +27,7 @@ import java.io.*; import java.util.*; +import java.util.concurrent.TimeUnit; import static org.opencb.opencga.storage.hadoop.variant.mr.VariantsTableMapReduceHelper.COUNTER_GROUP_NAME; @@ -493,7 +494,8 @@ public void run() { Text line = new Text(); LineReader stderrLineReader = new LineReader(stderr); try { - StopWatch stopWatch = StopWatch.createStarted(); + StopWatch stopWatch = new StopWatch(); + stopWatch.start(); write("---------- " + context.getTaskAttemptID().toString() + " -----------"); write("Start time : " + TimeUtils.getTimeMillis()); write("Batch start : " + firstKey + " -> " + outputKeyPrefix); @@ -530,7 +532,7 @@ public void run() { line.clear(); } write("--- END STDERR ---"); - write("Execution time : " + TimeUtils.durationToString(stopWatch)); + write("Execution time : " + TimeUtils.durationToString(stopWatch.now(TimeUnit.MILLISECONDS))); write("STDOUT lines : " + stdoutThread.numRecords); write("STDERR lines : " + numRecords); } catch (Throwable th) { From 697b08bea39a36575658c2c04640eb5eee8b1a80 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jacobo=20Coll=20Morag=C3=B3n?= Date: Fri, 25 Oct 2024 15:36:23 +0100 Subject: [PATCH 11/66] storage: Ensure stderr file is moved from scratch dir. 
#TASK-6722 --- .../opencb/opencga/analysis/variant/VariantWalkerTool.java | 2 +- .../analysis/variant/manager/VariantStorageManager.java | 2 +- .../opencga/storage/core/variant/VariantStorageEngine.java | 4 ++-- .../storage/core/variant/dummy/DummyVariantStorageEngine.java | 2 +- .../storage/hadoop/variant/HadoopVariantStorageEngine.java | 4 ++-- .../storage/hadoop/variant/mr/StreamVariantDriver.java | 3 ++- 6 files changed, 9 insertions(+), 8 deletions(-) diff --git a/opencga-analysis/src/main/java/org/opencb/opencga/analysis/variant/VariantWalkerTool.java b/opencga-analysis/src/main/java/org/opencb/opencga/analysis/variant/VariantWalkerTool.java index 56e008e0daf..68ad63d3549 100644 --- a/opencga-analysis/src/main/java/org/opencb/opencga/analysis/variant/VariantWalkerTool.java +++ b/opencga-analysis/src/main/java/org/opencb/opencga/analysis/variant/VariantWalkerTool.java @@ -82,7 +82,7 @@ protected void run() throws Exception { Query query = toolParams.toQuery(); QueryOptions queryOptions = new QueryOptions().append(QueryOptions.INCLUDE, toolParams.getInclude()) .append(QueryOptions.EXCLUDE, toolParams.getExclude()); - uris.add(variantStorageManager.walkData(outputFile, + uris.addAll(variantStorageManager.walkData(outputFile, format, query, queryOptions, toolParams.getDockerImage(), toolParams.getCommandLine(), token)); }); step("move-files", () -> { diff --git a/opencga-analysis/src/main/java/org/opencb/opencga/analysis/variant/manager/VariantStorageManager.java b/opencga-analysis/src/main/java/org/opencb/opencga/analysis/variant/manager/VariantStorageManager.java index f292e6d6a33..a5d02ab0205 100644 --- a/opencga-analysis/src/main/java/org/opencb/opencga/analysis/variant/manager/VariantStorageManager.java +++ b/opencga-analysis/src/main/java/org/opencb/opencga/analysis/variant/manager/VariantStorageManager.java @@ -204,7 +204,7 @@ public List exportData(String outputFile, VariantOutputFormat outputFormat, * @throws StorageEngineException If there is any error exporting variants * @return generated files */ - public URI walkData(String outputFile, VariantOutputFormat format, + public List walkData(String outputFile, VariantOutputFormat format, Query query, QueryOptions queryOptions, String dockerImage, String commandLine, String token) throws CatalogException, StorageEngineException { String anyStudy = catalogUtils.getAnyStudy(query, token); diff --git a/opencga-storage/opencga-storage-core/src/main/java/org/opencb/opencga/storage/core/variant/VariantStorageEngine.java b/opencga-storage/opencga-storage-core/src/main/java/org/opencb/opencga/storage/core/variant/VariantStorageEngine.java index b10b2c73058..81ddc4c0e3d 100644 --- a/opencga-storage/opencga-storage-core/src/main/java/org/opencb/opencga/storage/core/variant/VariantStorageEngine.java +++ b/opencga-storage/opencga-storage-core/src/main/java/org/opencb/opencga/storage/core/variant/VariantStorageEngine.java @@ -285,7 +285,7 @@ public List exportData(URI outputFile, VariantOutputFormat outputFormat, UR return exporter.export(outputFile, outputFormat, variantsFile, parsedVariantQuery); } - public URI walkData(URI outputFile, VariantWriterFactory.VariantOutputFormat format, Query query, QueryOptions queryOptions, + public List walkData(URI outputFile, VariantWriterFactory.VariantOutputFormat format, Query query, QueryOptions queryOptions, String dockerImage, String commandLine) throws IOException, StorageEngineException { if (format == VariantWriterFactory.VariantOutputFormat.VCF || format == 
VariantWriterFactory.VariantOutputFormat.VCF_GZ) { @@ -326,7 +326,7 @@ public URI walkData(URI outputFile, VariantWriterFactory.VariantOutputFormat for } - public abstract URI walkData(URI outputFile, VariantOutputFormat format, Query query, QueryOptions queryOptions, + public abstract List walkData(URI outputFile, VariantOutputFormat format, Query query, QueryOptions queryOptions, String commandLine) throws StorageEngineException; diff --git a/opencga-storage/opencga-storage-core/src/test/java/org/opencb/opencga/storage/core/variant/dummy/DummyVariantStorageEngine.java b/opencga-storage/opencga-storage-core/src/test/java/org/opencb/opencga/storage/core/variant/dummy/DummyVariantStorageEngine.java index 65a0169ef8d..e10370dcaaf 100644 --- a/opencga-storage/opencga-storage-core/src/test/java/org/opencb/opencga/storage/core/variant/dummy/DummyVariantStorageEngine.java +++ b/opencga-storage/opencga-storage-core/src/test/java/org/opencb/opencga/storage/core/variant/dummy/DummyVariantStorageEngine.java @@ -143,7 +143,7 @@ public void importData(URI input, VariantMetadata metadata, List walkData(URI outputFile, VariantWriterFactory.VariantOutputFormat format, Query query, QueryOptions queryOptions, String commandLine) throws StorageEngineException { throw new UnsupportedOperationException("Unable to walk data in " + getStorageEngineId()); } diff --git a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/HadoopVariantStorageEngine.java b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/HadoopVariantStorageEngine.java index 8598407233d..023dbbaeec0 100644 --- a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/HadoopVariantStorageEngine.java +++ b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/HadoopVariantStorageEngine.java @@ -317,7 +317,7 @@ protected VariantExporter newVariantExporter(VariantMetadataFactory metadataFact } @Override - public URI walkData(URI outputFile, VariantWriterFactory.VariantOutputFormat format, + public List walkData(URI outputFile, VariantWriterFactory.VariantOutputFormat format, Query query, QueryOptions queryOptions, String commandLine) throws StorageEngineException { ParsedVariantQuery variantQuery = parseQuery(query, queryOptions); int studyId = variantQuery.getStudyQuery().getDefaultStudy().getId(); @@ -342,7 +342,7 @@ public URI walkData(URI outputFile, VariantWriterFactory.VariantOutputFormat for .append(StreamVariantDriver.INPUT_FORMAT_PARAM, format.toString()) .append(StreamVariantDriver.OUTPUT_PARAM, outputFile) ), "Walk data"); - return outputFile; + return Arrays.asList(outputFile, UriUtils.createUriSafe(outputFile.toString() + StreamVariantDriver.STDERR_TXT_GZ)); } @Override diff --git a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/mr/StreamVariantDriver.java b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/mr/StreamVariantDriver.java index 9314c1e9818..cdab0e9e3c7 100644 --- a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/mr/StreamVariantDriver.java +++ 
b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/mr/StreamVariantDriver.java @@ -33,6 +33,7 @@ public class StreamVariantDriver extends VariantDriver { public static final String COMMAND_LINE_BASE64_PARAM = "commandLineBase64"; public static final String MAX_BYTES_PER_MAP_PARAM = "maxBytesPerMap"; public static final String ENVIRONMENT_VARIABLES = "envVars"; + public static final String STDERR_TXT_GZ = ".stderr.txt.gz"; private VariantWriterFactory.VariantOutputFormat format; private int maxBytesPerMap; @@ -161,7 +162,7 @@ protected String getJobOperationName() { @Override protected void copyMrOutputToLocal() throws IOException { concatMrOutputToLocal(outdir, localOutput, true, "stdout"); - Path stderrOutput = localOutput.suffix(".stderr.txt.gz"); + Path stderrOutput = localOutput.suffix(STDERR_TXT_GZ); concatMrOutputToLocal(outdir, stderrOutput, true, "stderr"); printKeyValue("EXTRA_OUTPUT_STDERR", stderrOutput); } From 356567e96e9f13b087c2857cf3a763718cf9694e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jacobo=20Coll=20Morag=C3=B3n?= Date: Fri, 25 Oct 2024 16:23:20 +0100 Subject: [PATCH 12/66] storage: Fix stderr sorting. #TASK-6722 --- .../storage/hadoop/variant/mr/StreamVariantMapper.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/mr/StreamVariantMapper.java b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/mr/StreamVariantMapper.java index 626a017512f..f8bba49fbcf 100644 --- a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/mr/StreamVariantMapper.java +++ b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/mr/StreamVariantMapper.java @@ -465,7 +465,7 @@ public void run() { private void write(Text line) throws IOException, InterruptedException { numRecords++; context.write(new ImmutableBytesWritable( - Bytes.toBytes(StreamVariantReducer.STDOUT_KEY + outputKeyPrefix + (stdoutKeyNum++))), line); + Bytes.toBytes(String.format("%s%s%08d", StreamVariantReducer.STDOUT_KEY, outputKeyPrefix, stdoutKeyNum++))), line); } } @@ -546,7 +546,7 @@ private void write(String line) throws IOException, InterruptedException { private void write(Text line) throws IOException, InterruptedException { context.write(new ImmutableBytesWritable( - Bytes.toBytes(StreamVariantReducer.STDERR_KEY + outputKeyPrefix + (stderrKeyNum++))), line); + Bytes.toBytes(String.format("%s%s%08d", StreamVariantReducer.STDERR_KEY, outputKeyPrefix, stderrKeyNum++))), line); } private boolean matchesReporter(String line) { From 6253da302c4137e8d4e0b491b009b9a090879cc0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jacobo=20Coll=20Morag=C3=B3n?= Date: Fri, 25 Oct 2024 17:47:14 +0100 Subject: [PATCH 13/66] storage: Write `\n` after the json header #TASK-6722 --- .../opencga/storage/hadoop/variant/mr/StreamVariantMapper.java | 1 + 1 file changed, 1 insertion(+) diff --git a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/mr/StreamVariantMapper.java b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/mr/StreamVariantMapper.java index f8bba49fbcf..0e45c182c0e 
100644 --- a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/mr/StreamVariantMapper.java +++ b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/mr/StreamVariantMapper.java @@ -372,6 +372,7 @@ private void startProcess(Context context) throws IOException, StorageEngineExce } ObjectMapper objectMapper = new ObjectMapper().configure(MapperFeature.REQUIRE_SETTERS_FOR_GETTERS, true); objectMapper.writeValue((DataOutput) stdin, metadata); + stdin.write('\n'); } processedBytes = 0; From 5789628871a47ebadc502a48e5137aa18d8d283e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jacobo=20Coll=20Morag=C3=B3n?= Date: Tue, 29 Oct 2024 16:02:35 +0000 Subject: [PATCH 14/66] storage: Do not interrupt header with empty records. #TASK-6722 --- .../hadoop/variant/mr/StreamVariantReducer.java | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/mr/StreamVariantReducer.java b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/mr/StreamVariantReducer.java index 3a52bfbfbcb..81e31be888c 100644 --- a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/mr/StreamVariantReducer.java +++ b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/mr/StreamVariantReducer.java @@ -1,5 +1,6 @@ package org.opencb.opencga.storage.hadoop.variant.mr; +import org.apache.commons.lang.StringUtils; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.hadoop.hbase.io.ImmutableBytesWritable; @@ -44,8 +45,13 @@ protected void reduce(ImmutableBytesWritable key, Iterable values, context.getCounter(VariantsTableMapReduceHelper.COUNTER_GROUP_NAME, "header_records").increment(1); } } else { - // No more header, assume all header is written - headerWritten = true; + if (value.getLength() < 3 && StringUtils.isBlank(value.toString())) { + context.getCounter(VariantsTableMapReduceHelper.COUNTER_GROUP_NAME, "stdout_records_empty").increment(1); + // Do not interrupt header with empty records + } else { + // No more header, assume all header is written + headerWritten = true; + } mos.write("stdout", key, value); context.getCounter(VariantsTableMapReduceHelper.COUNTER_GROUP_NAME, "body_records").increment(1); } From 4ff0655de9f412a9823a8ddb1219411fdeeb844a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jacobo=20Coll=20Morag=C3=B3n?= Date: Tue, 29 Oct 2024 17:26:10 +0000 Subject: [PATCH 15/66] storage: Add a custom Partitioner to ensure sorted data with multiple reducers #TASK-6722 --- .../storage/hadoop/variant/GenomeHelper.java | 42 +++++++++--- .../hadoop/variant/io/VariantDriver.java | 11 ++-- .../variant/mr/StreamVariantDriver.java | 21 ++++++ .../variant/mr/StreamVariantMapper.java | 1 + .../variant/mr/StreamVariantPartitioner.java | 64 +++++++++++++++++++ .../variant/mr/StreamVariantReducer.java | 1 + .../mr/StreamVariantPartitionerTest.java | 57 +++++++++++++++++ 7 files changed, 185 insertions(+), 12 deletions(-) create mode 100644 opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/mr/StreamVariantPartitioner.java create mode 100644 
opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/test/java/org/opencb/opencga/storage/hadoop/variant/mr/StreamVariantPartitionerTest.java diff --git a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/GenomeHelper.java b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/GenomeHelper.java index 680f312a409..0276898a612 100644 --- a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/GenomeHelper.java +++ b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/GenomeHelper.java @@ -97,12 +97,25 @@ public int getChunkSize() { } /** - * TODO: Query CellBase to get the chromosomes and sizes! * @param numberOfSplits Number of splits * @param keyGenerator Function to generate the rowKeys given a chromosome and a start * @return List of splits */ public static List generateBootPreSplitsHuman(int numberOfSplits, BiFunction keyGenerator) { + return generateBootPreSplitsHuman(numberOfSplits, keyGenerator, Bytes::compareTo, true); + } + + /** + * TODO: Query CellBase to get the chromosomes and sizes! + * @param numberOfSplits Number of splits + * @param keyGenerator Function to generate the rowKeys given a chromosome and a start + * @param compareTo Comparator to sort the splits + * @param includeEndSplit Include the last split + * @param Type of the split + * @return List of splits + */ + public static List generateBootPreSplitsHuman(int numberOfSplits, BiFunction keyGenerator, + Comparator compareTo, boolean includeEndSplit) { String[] chr = new String[]{"1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11", "12", "13", "14", "15", "16", "17", "18", "19", "20", "21", "22", "X", "Y", }; long[] posarr = new long[]{249250621, 243199373, 198022430, 191154276, 180915260, 171115067, 159138663, @@ -112,20 +125,20 @@ public static List generateBootPreSplitsHuman(int numberOfSplits, BiFunc for (int i = 0; i < chr.length; i++) { regions.put(chr[i], posarr[i]); } - return generateBootPreSplits(numberOfSplits, keyGenerator, regions); + return generateBootPreSplits(numberOfSplits, keyGenerator, regions, compareTo, includeEndSplit); } - static List generateBootPreSplits(int numberOfSplits, BiFunction keyGenerator, - Map regionsMap) { + static List generateBootPreSplits(int numberOfSplits, BiFunction keyGenerator, + Map regionsMap, Comparator comparator, boolean includeEndSplit) { // Create a sorted map for the regions that sorts as will sort HBase given the row_key generator // In archive table, chr1 goes after chr19, and in Variants table, chr1 is always the first SortedMap sortedRegions = new TreeMap<>((s1, s2) -> - Bytes.compareTo(keyGenerator.apply(s1, 0), keyGenerator.apply(s2, 0))); + comparator.compare(keyGenerator.apply(s1, 0), keyGenerator.apply(s2, 0))); sortedRegions.putAll(regionsMap); - long total = sortedRegions.values().stream().reduce((a, b) -> a + b).orElse(0L); + long total = regionsMap.values().stream().mapToLong(Long::longValue).sum(); long chunkSize = total / numberOfSplits; - List splitList = new ArrayList<>(); + List splitList = new ArrayList<>(); long splitPos = chunkSize; while (splitPos < total) { long tmpPos = 0; @@ -139,10 +152,23 @@ static List generateBootPreSplits(int numberOfSplits, BiFunction admin.getClusterStatus().getServersSize()); + // Set the number of reduce tasks to 2x 
the number of hosts + reduceTasks = serversSize * 2; + logger.info("Set reduce tasks to " + reduceTasks + " (derived from 'number_of_servers * 2')"); + } + job.setReducerClass(getReducerClass()); + job.setPartitionerClass(StreamVariantPartitioner.class); + job.setNumReduceTasks(reduceTasks); + // TODO: Use a grouping comparator to group by chromosome and position, ignoring the rest of the key? +// job.setGroupingComparatorClass(StreamVariantGroupingComparator.class); +// job.setSortComparatorClass(); + } + @Override protected String getJobOperationName() { return "stream-variants"; diff --git a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/mr/StreamVariantMapper.java b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/mr/StreamVariantMapper.java index 0e45c182c0e..cfcf3604522 100644 --- a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/mr/StreamVariantMapper.java +++ b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/mr/StreamVariantMapper.java @@ -143,6 +143,7 @@ public void run(Context context) throws IOException, InterruptedException { startProcess(context); // Do-while instead of "while", as we've already called context.nextKeyValue() once do { + // FIXME: If the chromosome is different, we should start a new process and get a new outputKeyPrefix if (processedBytes > maxInputBytesPerProcess) { LOG.info("Processed bytes = " + processedBytes + " > " + maxInputBytesPerProcess + ". Restarting process."); context.getCounter(COUNTER_GROUP_NAME, "RESTARTED_PROCESS").increment(1); diff --git a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/mr/StreamVariantPartitioner.java b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/mr/StreamVariantPartitioner.java new file mode 100644 index 00000000000..658ff0329f1 --- /dev/null +++ b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/mr/StreamVariantPartitioner.java @@ -0,0 +1,64 @@ +package org.opencb.opencga.storage.hadoop.variant.mr; + +import org.apache.hadoop.conf.Configurable; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.hbase.io.ImmutableBytesWritable; +import org.apache.hadoop.hbase.util.Bytes; +import org.apache.hadoop.mapreduce.Job; +import org.apache.hadoop.mapreduce.Partitioner; +import org.opencb.opencga.storage.hadoop.variant.GenomeHelper; + +import javax.xml.soap.Text; +import java.io.IOException; +import java.util.List; +import java.util.TreeMap; + +public class StreamVariantPartitioner extends Partitioner implements Configurable { + + private TreeMap regionSplitsMap = new TreeMap<>(); + private Configuration conf; + + @Override + public void setConf(Configuration conf) { + this.conf = conf; + try { + Job job = Job.getInstance(conf); + int numReduceTasks = job.getNumReduceTasks(); + setup(numReduceTasks); + } catch (IOException e) { + throw new RuntimeException(e); + } + } + + public TreeMap setup(int numPartitions) { + List splits = GenomeHelper.generateBootPreSplitsHuman( + numPartitions, StreamVariantMapper::buildOutputKeyPrefix, String::compareTo, false); + 
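// The "0"-chromosome prefix acts as a catch-all floor entry: it sorts before every
+ // real chromosome prefix, so the floorEntry() lookup in getPartition() below always
+ // finds a partition, even for keys preceding the first computed split boundary.
+ 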
regionSplitsMap.put(StreamVariantMapper.buildOutputKeyPrefix("0", 0), 0); + for (int i = 0; i < splits.size(); i++) { + regionSplitsMap.put(splits.get(i), regionSplitsMap.size()); + } + return regionSplitsMap; + } + + @Override + public Configuration getConf() { + return conf; + } + + @Override + public int getPartition(ImmutableBytesWritable key, Text text, int numPartitions) { + int start = key.getOffset() + StreamVariantReducer.STDOUT_KEY_BYTES.length; + byte[] bytes = key.get(); + // Find last '|' + int idx = 0; + for (int i = key.getLength() + key.getOffset() - 1; i >= 0; i--) { + if (bytes[i] == '|') { + idx = i; + break; + } + } + String chrPos = Bytes.toString(bytes, start, idx - start); + return regionSplitsMap.floorEntry(chrPos).getValue(); + } + +} diff --git a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/mr/StreamVariantReducer.java b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/mr/StreamVariantReducer.java index 81e31be888c..a6684c2d072 100644 --- a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/mr/StreamVariantReducer.java +++ b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/mr/StreamVariantReducer.java @@ -45,6 +45,7 @@ protected void reduce(ImmutableBytesWritable key, Iterable values, context.getCounter(VariantsTableMapReduceHelper.COUNTER_GROUP_NAME, "header_records").increment(1); } } else { + // length < 3 to include lines with a small combination of \n \r \t and spaces. if (value.getLength() < 3 && StringUtils.isBlank(value.toString())) { context.getCounter(VariantsTableMapReduceHelper.COUNTER_GROUP_NAME, "stdout_records_empty").increment(1); // Do not interrupt header with empty records diff --git a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/test/java/org/opencb/opencga/storage/hadoop/variant/mr/StreamVariantPartitionerTest.java b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/test/java/org/opencb/opencga/storage/hadoop/variant/mr/StreamVariantPartitionerTest.java new file mode 100644 index 00000000000..bbce3cd5cf0 --- /dev/null +++ b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/test/java/org/opencb/opencga/storage/hadoop/variant/mr/StreamVariantPartitionerTest.java @@ -0,0 +1,57 @@ +package org.opencb.opencga.storage.hadoop.variant.mr; + +import org.apache.hadoop.hbase.io.ImmutableBytesWritable; +import org.apache.hadoop.hbase.util.Bytes; +import org.junit.Before; +import org.junit.Test; +import org.junit.experimental.categories.Category; +import org.opencb.opencga.core.testclassification.duration.ShortTests; + +import static org.junit.Assert.assertEquals; + +@Category(ShortTests.class) +public class StreamVariantPartitionerTest { + + public static final int NUM_PARTITIONS = 10; + private StreamVariantPartitioner partitioner; + + @Before + public void setUp() { + partitioner = new StreamVariantPartitioner(); + partitioner.setup(NUM_PARTITIONS); + } + + @Test + public void partitionerTest() { + assertEquals(0, partitioner.getPartition(new ImmutableBytesWritable(Bytes.toBytes("o:00|0000000001|")), null, NUM_PARTITIONS)); + assertEquals(0, partitioner.getPartition(new ImmutableBytesWritable(Bytes.toBytes("o:01|0000000000|")), null, NUM_PARTITIONS)); + assertEquals(0, 
partitioner.getPartition(new ImmutableBytesWritable(Bytes.toBytes("o:02|0000000000|")), null, NUM_PARTITIONS)); + assertEquals(1, partitioner.getPartition(new ImmutableBytesWritable(Bytes.toBytes("o:03|0000000000|")), null, NUM_PARTITIONS)); + assertEquals(2, partitioner.getPartition(new ImmutableBytesWritable(Bytes.toBytes("o:04|0000000000|")), null, NUM_PARTITIONS)); + assertEquals(2, partitioner.getPartition(new ImmutableBytesWritable(Bytes.toBytes("o:05|0000000000|")), null, NUM_PARTITIONS)); + assertEquals(3, partitioner.getPartition(new ImmutableBytesWritable(Bytes.toBytes("o:06|0000000000|")), null, NUM_PARTITIONS)); + assertEquals(3, partitioner.getPartition(new ImmutableBytesWritable(Bytes.toBytes("o:07|0000000000|")), null, NUM_PARTITIONS)); + assertEquals(4, partitioner.getPartition(new ImmutableBytesWritable(Bytes.toBytes("o:08|0000000000|")), null, NUM_PARTITIONS)); + assertEquals(4, partitioner.getPartition(new ImmutableBytesWritable(Bytes.toBytes("o:09|0000000000|")), null, NUM_PARTITIONS)); + assertEquals(5, partitioner.getPartition(new ImmutableBytesWritable(Bytes.toBytes("o:10|0000000000|")), null, NUM_PARTITIONS)); + assertEquals(5, partitioner.getPartition(new ImmutableBytesWritable(Bytes.toBytes("o:11|0000000000|")), null, NUM_PARTITIONS)); + assertEquals(6, partitioner.getPartition(new ImmutableBytesWritable(Bytes.toBytes("o:12|0000000000|")), null, NUM_PARTITIONS)); + assertEquals(6, partitioner.getPartition(new ImmutableBytesWritable(Bytes.toBytes("o:13|0000000000|")), null, NUM_PARTITIONS)); + assertEquals(7, partitioner.getPartition(new ImmutableBytesWritable(Bytes.toBytes("o:14|0000000000|")), null, NUM_PARTITIONS)); + assertEquals(7, partitioner.getPartition(new ImmutableBytesWritable(Bytes.toBytes("o:15|0000000000|")), null, NUM_PARTITIONS)); + assertEquals(7, partitioner.getPartition(new ImmutableBytesWritable(Bytes.toBytes("o:16|0000000000|")), null, NUM_PARTITIONS)); + assertEquals(8, partitioner.getPartition(new ImmutableBytesWritable(Bytes.toBytes("o:17|0000000000|")), null, NUM_PARTITIONS)); + assertEquals(8, partitioner.getPartition(new ImmutableBytesWritable(Bytes.toBytes("o:17_random_contig|0000000000|")), null, NUM_PARTITIONS)); + assertEquals(8, partitioner.getPartition(new ImmutableBytesWritable(Bytes.toBytes("o:18|0000000000|")), null, NUM_PARTITIONS)); + assertEquals(8, partitioner.getPartition(new ImmutableBytesWritable(Bytes.toBytes("o:19|0000000000|")), null, NUM_PARTITIONS)); + assertEquals(8, partitioner.getPartition(new ImmutableBytesWritable(Bytes.toBytes("o:20|0000000000|")), null, NUM_PARTITIONS)); + assertEquals(8, partitioner.getPartition(new ImmutableBytesWritable(Bytes.toBytes("o:21|0000000000|")), null, NUM_PARTITIONS)); + assertEquals(9, partitioner.getPartition(new ImmutableBytesWritable(Bytes.toBytes("o:22|0000000000|")), null, NUM_PARTITIONS)); + assertEquals(9, partitioner.getPartition(new ImmutableBytesWritable(Bytes.toBytes("o:X|0000000000|")), null, NUM_PARTITIONS)); + assertEquals(9, partitioner.getPartition(new ImmutableBytesWritable(Bytes.toBytes("o:Y|0000000000|")), null, NUM_PARTITIONS)); + assertEquals(9, partitioner.getPartition(new ImmutableBytesWritable(Bytes.toBytes("o:MT|0000000000|")), null, NUM_PARTITIONS)); + assertEquals(9, partitioner.getPartition(new ImmutableBytesWritable(Bytes.toBytes("o:Z|0000000000|")), null, NUM_PARTITIONS)); + assertEquals(9, partitioner.getPartition(new ImmutableBytesWritable(Bytes.toBytes("o:Z_random_contig|0000000000|")), null, NUM_PARTITIONS)); + } + +} \ No newline at end of file 
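The test above pins down the partitioner's contract: the "chromosome|position" part of each key is looked up with TreeMap.floorEntry() against precomputed split boundaries, so every reducer receives one contiguous, sorted slice of the genome. Below is a minimal, self-contained sketch of that floor-lookup idea; the class name and the two boundary values are illustrative only, not part of the patch.

import java.util.Arrays;
import java.util.List;
import java.util.TreeMap;

public class RangePartitionSketch {

    // Ordered boundary -> partition index; the "" entry guarantees floorEntry() a match.
    private final TreeMap<String, Integer> splits = new TreeMap<>();

    public RangePartitionSketch(List<String> boundaries) {
        splits.put("", 0); // catch-all floor, like the "0"-chromosome seed above
        for (String boundary : boundaries) {
            splits.put(boundary, splits.size()); // each boundary opens the next partition
        }
    }

    public int partitionFor(String chrPosPrefix) {
        // Greatest boundary <= key, i.e. the partition whose range contains this locus
        return splits.floorEntry(chrPosPrefix).getValue();
    }

    public static void main(String[] args) {
        RangePartitionSketch partitioner = new RangePartitionSketch(
                Arrays.asList("08|0000000000|", "16|0000000000|"));
        System.out.println(partitioner.partitionFor("03|0000001234|")); // 0
        System.out.println(partitioner.partitionFor("10|0000000000|")); // 1
        System.out.println(partitioner.partitionFor("X|0000000000|"));  // 2, "X" sorts after "16"
    }
}

The same range lookup carries over when a later patch in this series replaces the string-prefix key with the VariantLocusKey writable.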
From 82682667d9de2a516c7d1341446c9b8e670449c0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jacobo=20Coll=20Morag=C3=B3n?= Date: Tue, 29 Oct 2024 17:50:53 +0000 Subject: [PATCH 16/66] storage: Fix partitioner. #TASK-6722 --- .../storage/hadoop/variant/mr/StreamVariantDriver.java | 5 +++-- .../storage/hadoop/variant/mr/StreamVariantPartitioner.java | 2 +- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/mr/StreamVariantDriver.java b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/mr/StreamVariantDriver.java index dede7b67896..ccf20074087 100644 --- a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/mr/StreamVariantDriver.java +++ b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/mr/StreamVariantDriver.java @@ -155,11 +155,12 @@ protected void setupJob(Job job) throws IOException { @Override protected void setupReducer(Job job, String variantTableName) throws IOException { - String numReducersStr = getParam(JobContext.NUM_REDUCES); + String numReducersKey = getClass().getSimpleName() + "." + JobContext.NUM_REDUCES; + String numReducersStr = getParam(numReducersKey); int reduceTasks; if (StringUtils.isNotEmpty(numReducersStr)) { reduceTasks = Integer.parseInt(numReducersStr); - logger.info("Set reduce tasks to " + reduceTasks + " (derived from input parameter '" + JobContext.NUM_REDUCES + "')"); + logger.info("Set reduce tasks to " + reduceTasks + " (derived from input parameter '" + numReducersKey + "')"); } else { int serversSize = getHBaseManager().act(variantTableName, (table, admin) -> admin.getClusterStatus().getServersSize()); // Set the number of reduce tasks to 2x the number of hosts diff --git a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/mr/StreamVariantPartitioner.java b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/mr/StreamVariantPartitioner.java index 658ff0329f1..d2e1f0056c2 100644 --- a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/mr/StreamVariantPartitioner.java +++ b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/mr/StreamVariantPartitioner.java @@ -4,11 +4,11 @@ import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.hbase.io.ImmutableBytesWritable; import org.apache.hadoop.hbase.util.Bytes; +import org.apache.hadoop.io.Text; import org.apache.hadoop.mapreduce.Job; import org.apache.hadoop.mapreduce.Partitioner; import org.opencb.opencga.storage.hadoop.variant.GenomeHelper; -import javax.xml.soap.Text; import java.io.IOException; import java.util.List; import java.util.TreeMap; From 4147d0157ef4a30f7af953be237b3b345c8cbb6f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jacobo=20Coll=20Morag=C3=B3n?= Date: Tue, 29 Oct 2024 17:52:15 +0000 Subject: [PATCH 17/66] storage: Restart process when changing chromosome to ensure correct sorting. 
#TASK-6722 --- .../variant/mr/StreamVariantMapper.java | 58 +++++++++++++------ 1 file changed, 39 insertions(+), 19 deletions(-) diff --git a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/mr/StreamVariantMapper.java b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/mr/StreamVariantMapper.java index cfcf3604522..68993980993 100644 --- a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/mr/StreamVariantMapper.java +++ b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/mr/StreamVariantMapper.java @@ -53,16 +53,13 @@ public class StreamVariantMapper extends VariantMapper variantDataWriter; + protected final List throwables = Collections.synchronizedList(new ArrayList<>()); private int processedBytes = 0; private long numRecordsRead = 0; private long numRecordsWritten = 0; - protected final List throwables = Collections.synchronizedList(new ArrayList<>()); + // auto-incremental number for each produced record. + // This is used with the outputKeyPrefix to ensure a sorted output. + private int stdoutKeyNum; + private int stderrKeyNum; + private String currentChromosome; + private int currentPosition; + private String outputKeyPrefix; private volatile boolean processProvidedStatus_ = false; private VariantMetadata metadata; @@ -112,11 +116,6 @@ protected void setup(Context context) throws IOException, InterruptedException { writerFactory = new VariantWriterFactory(metadataManager); query = VariantMapReduceUtil.getQueryFromConfig(conf); options = VariantMapReduceUtil.getQueryOptionsFromConfig(conf); - Variant variant = context.getCurrentValue(); - firstKey = variant.getChromosome() + ":" + variant.getStart(); - outputKeyPrefix = buildOutputKeyPrefix(variant.getChromosome(), variant.getStart()); - stdoutKeyNum = 0; - stderrKeyNum = 0; } public static String buildOutputKeyPrefix(String chromosome, Integer start) { @@ -143,14 +142,17 @@ public void run(Context context) throws IOException, InterruptedException { startProcess(context); // Do-while instead of "while", as we've already called context.nextKeyValue() once do { - // FIXME: If the chromosome is different, we should start a new process and get a new outputKeyPrefix + currentValue = context.getCurrentValue(); + // Restart the process if the input bytes exceed the limit + // or if the chromosome changes if (processedBytes > maxInputBytesPerProcess) { LOG.info("Processed bytes = " + processedBytes + " > " + maxInputBytesPerProcess + ". Restarting process."); - context.getCounter(COUNTER_GROUP_NAME, "RESTARTED_PROCESS").increment(1); - closeProcess(context); - startProcess(context); + restartProcess(context, "BYTES_LIMIT"); + } else if (!currentChromosome.equals(currentValue.getChromosome())) { + LOG.info("Chromosome changed from " + currentChromosome + " to " + currentValue.getChromosome() + + ". 
Restarting process."); + restartProcess(context, "CHR_CHANGE"); } - currentValue = context.getCurrentValue(); map(context.getCurrentKey(), currentValue, context); } while (!hasExceptions() && context.nextKeyValue()); } catch (Throwable th) { @@ -187,6 +189,13 @@ public void run(Context context) throws IOException, InterruptedException { throwExceptionIfAny(); } + private void restartProcess(Mapper.Context context, String reason) + throws IOException, InterruptedException, StorageEngineException { + context.getCounter(COUNTER_GROUP_NAME, "RESTARTED_PROCESS_" + reason).increment(1); + closeProcess(context); + startProcess(context); + } + private boolean hasExceptions() { return !throwables.isEmpty(); } @@ -336,10 +345,20 @@ private void closeProcess(Context context) throws IOException, InterruptedExcept // drainStdout(context); } - private void startProcess(Context context) throws IOException, StorageEngineException { + private void startProcess(Context context) throws IOException, StorageEngineException, InterruptedException { LOG.info("bash -ce '" + commandLine + "'"); context.getCounter(COUNTER_GROUP_NAME, "START_PROCESS").increment(1); + Variant variant = context.getCurrentValue(); + currentChromosome = variant.getChromosome(); + currentPosition = variant.getStart(); + if (firstVariant == null) { + firstVariant = variant.getChromosome() + ":" + variant.getStart(); + } + outputKeyPrefix = buildOutputKeyPrefix(variant.getChromosome(), variant.getStart()); + stdoutKeyNum = 0; + stderrKeyNum = 0; + // Start the process ProcessBuilder builder = new ProcessBuilder("bash", "-ce", commandLine); // System.getenv().forEach((k, v) -> LOG.info("SYSTEM ENV: " + k + "=" + v)); @@ -500,7 +519,8 @@ public void run() { stopWatch.start(); write("---------- " + context.getTaskAttemptID().toString() + " -----------"); write("Start time : " + TimeUtils.getTimeMillis()); - write("Batch start : " + firstKey + " -> " + outputKeyPrefix); + write("Input split : " + firstVariant); + write("Batch start : " + currentChromosome + ":" + currentPosition + " -> " + outputKeyPrefix); write("sub-process #" + processCount); write("--- START STDERR ---"); int numRecords = 0; From 7fd439a7733872b38c66b5f4d19e2c2fac64d0d7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jacobo=20Coll=20Morag=C3=B3n?= Date: Tue, 29 Oct 2024 22:05:26 +0000 Subject: [PATCH 18/66] storage: Fix GenomeHellper generateBootPreSplits. #TASK-6722 --- .../opencb/opencga/storage/hadoop/variant/GenomeHelper.java | 5 +++-- .../storage/hadoop/variant/mr/StreamVariantMapper.java | 1 + .../opencga/storage/hadoop/variant/GenomeHelperTest.java | 5 +++++ 3 files changed, 9 insertions(+), 2 deletions(-) diff --git a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/GenomeHelper.java b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/GenomeHelper.java index 0276898a612..93059088577 100644 --- a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/GenomeHelper.java +++ b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/GenomeHelper.java @@ -159,9 +159,10 @@ static List generateBootPreSplits(int numberOfSplits, BiFunction " + maxInputBytesPerProcess + ". 
Restarting process."); restartProcess(context, "BYTES_LIMIT"); } else if (!currentChromosome.equals(currentValue.getChromosome())) { + // TODO: Should we change only when the chromosome change would produce a partition change? LOG.info("Chromosome changed from " + currentChromosome + " to " + currentValue.getChromosome() + ". Restarting process."); restartProcess(context, "CHR_CHANGE"); diff --git a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/test/java/org/opencb/opencga/storage/hadoop/variant/GenomeHelperTest.java b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/test/java/org/opencb/opencga/storage/hadoop/variant/GenomeHelperTest.java index 4438c668909..ab359f1b915 100644 --- a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/test/java/org/opencb/opencga/storage/hadoop/variant/GenomeHelperTest.java +++ b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/test/java/org/opencb/opencga/storage/hadoop/variant/GenomeHelperTest.java @@ -71,6 +71,11 @@ public void testGenerateSplitArchive() throws Exception { assertOrder(GenomeHelper.generateBootPreSplitsHuman(30, (chr, pos) -> keyFactory.generateBlockIdAsBytes(1, chr, pos)), 30); } + @Test + public void testGenerateSplitArchiveMultiple() throws Exception { + assertOrder(GenomeHelper.generateBootPreSplitsHuman(2, (chr, pos) -> keyFactory.generateBlockIdAsBytes(1, chr, pos)), 2); + } + @Test public void testGenerateSplitVariant() throws Exception { int expectedSize = 10; From e6128b0f5223c3ce2c3517e32e3ccb9722b1ea0a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jacobo=20Coll=20Morag=C3=B3n?= Date: Wed, 30 Oct 2024 15:12:34 +0000 Subject: [PATCH 19/66] storage: Do not interrupt header with empty lines while concat. #TASK-6722 --- .../storage/hadoop/utils/AbstractHBaseDriver.java | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/utils/AbstractHBaseDriver.java b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/utils/AbstractHBaseDriver.java index 3773141eefd..e86a8dd3fb7 100644 --- a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/utils/AbstractHBaseDriver.java +++ b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/utils/AbstractHBaseDriver.java @@ -471,9 +471,11 @@ protected List concatMrOutputToLocal(Path mrOutdir, Path localOutput, bool if (paths.isEmpty()) { LOGGER.warn("The MapReduce job didn't produce any output. This may not be expected."); } else if (paths.size() == 1) { - LOGGER.info("Copy to local file " + paths.get(0).toUri() + " to " + localOutput.toUri()); + LOGGER.info("Copy to local file"); + LOGGER.info(" Source : {} ({})", + paths.get(0).toUri(), humanReadableByteCount(fileSystem.getFileStatus(paths.get(0)).getLen(), false)); + LOGGER.info(" Target : {}", localOutput.toUri()); fileSystem.copyToLocalFile(false, paths.get(0), localOutput); - LOGGER.info("File size : " + humanReadableByteCount(Files.size(Paths.get(localOutput.toUri())), false)); } else { LOGGER.info("Concat and copy to local " + paths.size()); LOGGER.info(" Source : " + mrOutdir.toUri()); @@ -485,8 +487,8 @@ protected List concatMrOutputToLocal(Path mrOutdir, Path localOutput, bool OutputStream os = gzOs == null ? 
fsOs : gzOs; for (int i = 0; i < paths.size(); i++) { Path path = paths.get(i); - LOGGER.info("Concat {}file : '{}' {} ", - isGzip ? "gzip " : "", + LOGGER.info("Concat {} : '{}' ({}) ", + isGzip ? "gzip file" : "file", path.toUri(), humanReadableByteCount(fileSystem.getFileStatus(path).getLen(), false)); try (FSDataInputStream fsIs = fileSystem.open(path)) { @@ -503,7 +505,8 @@ protected List concatMrOutputToLocal(Path mrOutdir, Path localOutput, bool do { br.mark(10 * 1024 * 1024); //10MB line = br.readLine(); - } while (line != null && line.startsWith("#")); + // Skip blank lines and + } while (line != null && (StringUtils.isBlank(line) || line.startsWith("#"))); br.reset(); is = new ReaderInputStream(br, Charset.defaultCharset()); } From 100fecfcc93076cdf51767f2dd8f16d8f352c760 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jacobo=20Coll=20Morag=C3=B3n?= Date: Thu, 31 Oct 2024 13:49:44 +0000 Subject: [PATCH 20/66] storage: Replace ImmutableBytesWritable with VariantLocusKey as map output key. #TASK-6722 --- .../hadoop/variant/io/VariantDriver.java | 32 +++- .../variant/mr/StreamVariantDriver.java | 30 ++-- .../variant/mr/StreamVariantMapper.java | 37 ++--- .../variant/mr/StreamVariantPartitioner.java | 64 -------- .../variant/mr/StreamVariantReducer.java | 14 +- .../hadoop/variant/mr/VariantLocusKey.java | 146 +++++++++++++++++ .../mr/VariantLocusKeyPartitioner.java | 50 ++++++ .../variant/mr/StreamVariantMapperTest.java | 41 ----- .../mr/StreamVariantPartitionerTest.java | 57 ------- .../mr/VariantLocusKeyPartitionerTest.java | 56 +++++++ .../variant/mr/VariantLocusKeyTest.java | 152 ++++++++++++++++++ 11 files changed, 459 insertions(+), 220 deletions(-) delete mode 100644 opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/mr/StreamVariantPartitioner.java create mode 100644 opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/mr/VariantLocusKey.java create mode 100644 opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/mr/VariantLocusKeyPartitioner.java delete mode 100644 opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/test/java/org/opencb/opencga/storage/hadoop/variant/mr/StreamVariantMapperTest.java delete mode 100644 opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/test/java/org/opencb/opencga/storage/hadoop/variant/mr/StreamVariantPartitionerTest.java create mode 100644 opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/test/java/org/opencb/opencga/storage/hadoop/variant/mr/VariantLocusKeyPartitionerTest.java create mode 100644 opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/test/java/org/opencb/opencga/storage/hadoop/variant/mr/VariantLocusKeyTest.java diff --git a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/io/VariantDriver.java b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/io/VariantDriver.java index d11b1824aa6..223c0b91559 100644 --- a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/io/VariantDriver.java +++ b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/io/VariantDriver.java @@ 
-3,8 +3,10 @@ import org.apache.commons.lang3.StringUtils; import org.apache.hadoop.fs.Path; import org.apache.hadoop.hbase.client.Scan; +import org.apache.hadoop.mapred.JobContext; import org.apache.hadoop.mapreduce.Job; import org.apache.hadoop.mapreduce.OutputFormat; +import org.apache.hadoop.mapreduce.Partitioner; import org.apache.hadoop.mapreduce.Reducer; import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; import org.opencb.commons.datastore.core.Query; @@ -93,6 +95,10 @@ protected void parseAndValidateParameters() throws IOException { protected abstract Class getReducerClass(); + protected Class getPartitioner() { + return null; + } + protected abstract Class getOutputFormatClass(); protected abstract void setupJob(Job job) throws IOException; @@ -160,12 +166,30 @@ protected final Job setupJob(Job job, String archiveTable, String variantTable) } protected void setupReducer(Job job, String variantTable) throws IOException { - logger.info("Use one Reduce task to produce a single file"); - job.setReducerClass(getReducerClass()); - job.setNumReduceTasks(1); + Class partitionerClass = getPartitioner(); + if (partitionerClass == null) { + logger.info("Use one Reduce task to produce a single file"); + job.setReducerClass(getReducerClass()); + job.setNumReduceTasks(1); + } else { + String numReducersKey = getClass().getSimpleName() + "." + JobContext.NUM_REDUCES; + String numReducersStr = getParam(numReducersKey); + int reduceTasks; + if (StringUtils.isNotEmpty(numReducersStr)) { + reduceTasks = Integer.parseInt(numReducersStr); + logger.info("Set reduce tasks to " + reduceTasks + " (derived from input parameter '" + numReducersKey + "')"); + } else { + int serversSize = getHBaseManager().act(variantTable, (table, admin) -> admin.getClusterStatus().getServersSize()); + // Set the number of reduce tasks to 2x the number of servers + reduceTasks = serversSize * 2; + logger.info("Set reduce tasks to " + reduceTasks + " (derived from 'number_of_servers * 2')"); + } + job.setReducerClass(getReducerClass()); + job.setPartitionerClass(partitionerClass); + job.setNumReduceTasks(reduceTasks); + } } - @Override protected void postExecution(boolean succeed) throws IOException, StorageEngineException { super.postExecution(succeed); diff --git a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/mr/StreamVariantDriver.java b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/mr/StreamVariantDriver.java index ccf20074087..b6cedb5d484 100644 --- a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/mr/StreamVariantDriver.java +++ b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/mr/StreamVariantDriver.java @@ -2,7 +2,6 @@ import org.apache.commons.lang3.StringUtils; import org.apache.hadoop.fs.Path; -import org.apache.hadoop.hbase.io.ImmutableBytesWritable; import org.apache.hadoop.io.Text; import org.apache.hadoop.io.compress.CompressionCodec; import org.apache.hadoop.io.compress.DeflateCodec; @@ -10,6 +9,7 @@ import org.apache.hadoop.mapred.JobContext; import org.apache.hadoop.mapreduce.Job; import org.apache.hadoop.mapreduce.OutputFormat; +import org.apache.hadoop.mapreduce.Partitioner; import org.apache.hadoop.mapreduce.Reducer; import org.apache.hadoop.mapreduce.lib.output.LazyOutputFormat; import 
org.apache.hadoop.mapreduce.lib.output.MultipleOutputs; @@ -113,6 +113,11 @@ protected Class getReducerClass() { return reducerClass; } + @Override + protected Class getPartitioner() { + return VariantLocusKeyPartitioner.class; + } + @Override protected Class getOutputFormatClass() { return outputFormatClass; @@ -124,9 +129,10 @@ protected void setupJob(Job job) throws IOException { job.getConfiguration().setBoolean(JobContext.MAP_OUTPUT_COMPRESS, true); job.getConfiguration().setClass(JobContext.MAP_OUTPUT_COMPRESS_CODEC, DeflateCodec.class, CompressionCodec.class); - Class keyClass = ImmutableBytesWritable.class; -// Class keyClass = NullWritable.class; -// Class keyClass = Text.class; + Class keyClass = VariantLocusKey.class; +// Class keyClass = ImmutableBytesWritable.class; +// Class keyClass = NullWritable.class; +// Class keyClass = Text.class; Class valueClass = Text.class; mapperClass = StreamVariantMapper.class; @@ -155,21 +161,7 @@ protected void setupJob(Job job) throws IOException { @Override protected void setupReducer(Job job, String variantTableName) throws IOException { - String numReducersKey = getClass().getSimpleName() + "." + JobContext.NUM_REDUCES; - String numReducersStr = getParam(numReducersKey); - int reduceTasks; - if (StringUtils.isNotEmpty(numReducersStr)) { - reduceTasks = Integer.parseInt(numReducersStr); - logger.info("Set reduce tasks to " + reduceTasks + " (derived from input parameter '" + numReducersKey + "')"); - } else { - int serversSize = getHBaseManager().act(variantTableName, (table, admin) -> admin.getClusterStatus().getServersSize()); - // Set the number of reduce tasks to 2x the number of hosts - reduceTasks = serversSize * 2; - logger.info("Set reduce tasks to " + reduceTasks + " (derived from 'number_of_servers * 2')"); - } - job.setReducerClass(getReducerClass()); - job.setPartitionerClass(StreamVariantPartitioner.class); - job.setNumReduceTasks(reduceTasks); + super.setupReducer(job, variantTableName); // TODO: Use a grouping comparator to group by chromosome and position, ignoring the rest of the key? 
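// (Hypothetically, such a grouping comparator would compare only the chromosome and
// position of VariantLocusKey while the full key still drives the sort order, so a
// single reduce() call would see every record for a locus. It is left as a TODO:)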
// job.setGroupingComparatorClass(StreamVariantGroupingComparator.class); // job.setSortComparatorClass(); diff --git a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/mr/StreamVariantMapper.java b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/mr/StreamVariantMapper.java index f6fc4d74c5e..03c3aa6b0b0 100644 --- a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/mr/StreamVariantMapper.java +++ b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/mr/StreamVariantMapper.java @@ -31,7 +31,7 @@ import static org.opencb.opencga.storage.hadoop.variant.mr.VariantsTableMapReduceHelper.COUNTER_GROUP_NAME; -public class StreamVariantMapper extends VariantMapper { +public class StreamVariantMapper extends VariantMapper { private static final Log LOG = LogFactory.getLog(StreamVariantMapper.class); private static final int BUFFER_SIZE = 128 * 1024; @@ -72,12 +72,11 @@ public class StreamVariantMapper extends VariantMapper 01 - // 3 -> 03 - // 22 -> 22 - // If the first character is a digit, and the second is not, add a 0 at the beginning - // MT -> MT - // 1_KI270712v1_random -> 01_KI270712v1_random - if (Character.isDigit(chromosome.charAt(0)) && (chromosome.length() == 1 || !Character.isDigit(chromosome.charAt(1)))) { - chromosome = "0" + chromosome; - } - - return String.format("%s|%010d|", chromosome, start); - } - @Override public void run(Context context) throws IOException, InterruptedException { if (context.nextKeyValue()) { @@ -190,7 +174,7 @@ public void run(Context context) throws IOException, InterruptedException { throwExceptionIfAny(); } - private void restartProcess(Mapper.Context context, String reason) + private void restartProcess(Mapper.Context context, String reason) throws IOException, InterruptedException, StorageEngineException { context.getCounter(COUNTER_GROUP_NAME, "RESTARTED_PROCESS_" + reason).increment(1); closeProcess(context); @@ -263,7 +247,7 @@ private void throwExceptionIfAny() throws IOException { } @Override - protected void cleanup(Mapper.Context context) throws IOException, InterruptedException { + protected void cleanup(Mapper.Context context) throws IOException, InterruptedException { closeProcess(context); dockerPruneImages(); super.cleanup(context); @@ -356,7 +340,6 @@ private void startProcess(Context context) throws IOException, StorageEngineExce if (firstVariant == null) { firstVariant = variant.getChromosome() + ":" + variant.getStart(); } - outputKeyPrefix = buildOutputKeyPrefix(variant.getChromosome(), variant.getStart()); stdoutKeyNum = 0; stderrKeyNum = 0; @@ -448,7 +431,7 @@ public static void addEnvironment(Map env, Configuration conf) { private class MROutputThread extends Thread { - private final Mapper.Context context; + private final Mapper.Context context; private long lastStdoutReport = 0; private int numRecords = 0; @@ -486,15 +469,14 @@ public void run() { private void write(Text line) throws IOException, InterruptedException { numRecords++; - context.write(new ImmutableBytesWritable( - Bytes.toBytes(String.format("%s%s%08d", StreamVariantReducer.STDOUT_KEY, outputKeyPrefix, stdoutKeyNum++))), line); + context.write(new VariantLocusKey(currentChromosome, currentPosition, StreamVariantReducer.STDOUT_KEY + (stdoutKeyNum++)), line); } } private class 
MRErrorThread extends Thread { private final Configuration conf; - private final Mapper.Context context; + private final Mapper.Context context; private long lastStderrReport = 0; private final String reporterPrefix; private final String counterPrefix; @@ -521,7 +503,7 @@ public void run() { write("---------- " + context.getTaskAttemptID().toString() + " -----------"); write("Start time : " + TimeUtils.getTimeMillis()); write("Input split : " + firstVariant); - write("Batch start : " + currentChromosome + ":" + currentPosition + " -> " + outputKeyPrefix); + write("Batch start : " + currentChromosome + ":" + currentPosition); write("sub-process #" + processCount); write("--- START STDERR ---"); int numRecords = 0; @@ -568,8 +550,7 @@ private void write(String line) throws IOException, InterruptedException { } private void write(Text line) throws IOException, InterruptedException { - context.write(new ImmutableBytesWritable( - Bytes.toBytes(String.format("%s%s%08d", StreamVariantReducer.STDERR_KEY, outputKeyPrefix, stderrKeyNum++))), line); + context.write(new VariantLocusKey(currentChromosome, currentPosition, StreamVariantReducer.STDERR_KEY + (stderrKeyNum++)), line); } private boolean matchesReporter(String line) { diff --git a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/mr/StreamVariantPartitioner.java b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/mr/StreamVariantPartitioner.java deleted file mode 100644 index d2e1f0056c2..00000000000 --- a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/mr/StreamVariantPartitioner.java +++ /dev/null @@ -1,64 +0,0 @@ -package org.opencb.opencga.storage.hadoop.variant.mr; - -import org.apache.hadoop.conf.Configurable; -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.hbase.io.ImmutableBytesWritable; -import org.apache.hadoop.hbase.util.Bytes; -import org.apache.hadoop.io.Text; -import org.apache.hadoop.mapreduce.Job; -import org.apache.hadoop.mapreduce.Partitioner; -import org.opencb.opencga.storage.hadoop.variant.GenomeHelper; - -import java.io.IOException; -import java.util.List; -import java.util.TreeMap; - -public class StreamVariantPartitioner extends Partitioner implements Configurable { - - private TreeMap regionSplitsMap = new TreeMap<>(); - private Configuration conf; - - @Override - public void setConf(Configuration conf) { - this.conf = conf; - try { - Job job = Job.getInstance(conf); - int numReduceTasks = job.getNumReduceTasks(); - setup(numReduceTasks); - } catch (IOException e) { - throw new RuntimeException(e); - } - } - - public TreeMap setup(int numPartitions) { - List splits = GenomeHelper.generateBootPreSplitsHuman( - numPartitions, StreamVariantMapper::buildOutputKeyPrefix, String::compareTo, false); - regionSplitsMap.put(StreamVariantMapper.buildOutputKeyPrefix("0", 0), 0); - for (int i = 0; i < splits.size(); i++) { - regionSplitsMap.put(splits.get(i), regionSplitsMap.size()); - } - return regionSplitsMap; - } - - @Override - public Configuration getConf() { - return conf; - } - - @Override - public int getPartition(ImmutableBytesWritable key, Text text, int numPartitions) { - int start = key.getOffset() + StreamVariantReducer.STDOUT_KEY_BYTES.length; - byte[] bytes = key.get(); - // Find last '|' - int idx = 0; - for (int i = key.getLength() + key.getOffset() - 1; i >= 0; i--) { - 
if (bytes[i] == '|') { - idx = i; - break; - } - } - String chrPos = Bytes.toString(bytes, start, idx - start); - return regionSplitsMap.floorEntry(chrPos).getValue(); - } - -} diff --git a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/mr/StreamVariantReducer.java b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/mr/StreamVariantReducer.java index a6684c2d072..25598e593f9 100644 --- a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/mr/StreamVariantReducer.java +++ b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/mr/StreamVariantReducer.java @@ -11,7 +11,7 @@ import java.io.IOException; -public class StreamVariantReducer extends Reducer { +public class StreamVariantReducer extends Reducer { public static final String STDOUT_KEY = "O:"; public static final byte[] STDOUT_KEY_BYTES = Bytes.toBytes(STDOUT_KEY); @@ -20,22 +20,22 @@ public class StreamVariantReducer extends Reducer mos; + private MultipleOutputs mos; private boolean headerWritten = false; @Override - protected void setup(Reducer.Context context) + protected void setup(Reducer.Context context) throws IOException, InterruptedException { super.setup(context); mos = new MultipleOutputs<>(context); } @Override - protected void reduce(ImmutableBytesWritable key, Iterable values, - Reducer.Context context) + protected void reduce(VariantLocusKey key, Iterable values, + Reducer.Context context) throws IOException, InterruptedException { for (Text value : values) { - if (hasPrefix(key, STDOUT_KEY_BYTES)) { + if (key.getOther().startsWith(STDOUT_KEY)) { if (hasPrefix(value, HEADER_PREFIX_BYTES)) { if (headerWritten) { // skip header @@ -81,7 +81,7 @@ private static boolean hasPrefix(byte[] key, int offset, int length, byte[] pref } @Override - protected void cleanup(Reducer.Context context) + protected void cleanup(Reducer.Context context) throws IOException, InterruptedException { super.cleanup(context); mos.close(); diff --git a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/mr/VariantLocusKey.java b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/mr/VariantLocusKey.java new file mode 100644 index 00000000000..a198e03de5c --- /dev/null +++ b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/mr/VariantLocusKey.java @@ -0,0 +1,146 @@ +package org.opencb.opencga.storage.hadoop.variant.mr; + +import org.apache.hadoop.io.WritableComparable; +import org.opencb.biodata.models.variant.Variant; + +import java.io.DataInput; +import java.io.DataOutput; +import java.io.IOException; +import java.util.Objects; + +/** + * Genomic locus key. 
+ */ +public class VariantLocusKey implements WritableComparable { + private String chromosome; + private int position; + private String other; + + public VariantLocusKey() { + } + + public VariantLocusKey(String chromosome, int position) { + this.chromosome = chromosome; + this.position = position; + this.other = null; + } + + public VariantLocusKey(Variant variant) { + this(variant.getChromosome(), variant.getStart(), variant.getReference() + "_" + variant.getAlternate()); + } + + public VariantLocusKey(String chromosome, int position, String other) { + this.chromosome = chromosome; + this.position = position; + this.other = other; + } + + @Override + public int compareTo(VariantLocusKey o) { + String chr1; + String chr2; + if (isSingleDigitChromosome(chromosome)) { + chr1 = "0" + chromosome; + } else { + chr1 = chromosome; + } + if (isSingleDigitChromosome(o.chromosome)) { + chr2 = "0" + o.chromosome; + } else { + chr2 = o.chromosome; + } + int i = chr1.compareTo(chr2); + if (i == 0) { + i = position - o.position; + } + if (i == 0) { + if (other == null) { + i = o.other == null ? 0 : -1; + } else if (o.other == null) { + i = 1; + } else { + i = other.compareTo(o.other); + } + } + return i; + } + + public static boolean isSingleDigitChromosome(String chromosome) { + return Character.isDigit(chromosome.charAt(0)) && (chromosome.length() == 1 || !Character.isDigit(chromosome.charAt(1))); + } + + @Override + public void write(DataOutput out) throws IOException { + out.writeChars(chromosome); + out.writeChars("\n"); + out.writeInt(position); + if (other != null) { + out.writeChars(other); + } else { + out.writeChars(""); + } + } + + @Override + public void readFields(DataInput in) throws IOException { + chromosome = in.readLine(); + position = in.readInt(); + other = in.readLine(); + } + + public String getChromosome() { + return chromosome; + } + + public VariantLocusKey setChromosome(String chromosome) { + this.chromosome = chromosome; + return this; + } + + public int getPosition() { + return position; + } + + public VariantLocusKey setPosition(int position) { + this.position = position; + return this; + } + + public String getOther() { + return other; + } + + public VariantLocusKey setOther(String other) { + this.other = other; + return this; + } + + @Override + public boolean equals(Object o) { + if (this == o) { + return true; + } + if (o == null || getClass() != o.getClass()) { + return false; + } + VariantLocusKey that = (VariantLocusKey) o; + return position == that.position + && Objects.equals(chromosome, that.chromosome) + && Objects.equals(other, that.other); + } + + @Override + public int hashCode() { + return Objects.hash(chromosome, position, other); + } + + @Override + public String toString() { + final StringBuilder sb = new StringBuilder("VariantLocusKey{"); + sb.append("chromosome='").append(chromosome).append('\''); + sb.append(", position=").append(position); + sb.append(", other='").append(other).append('\''); + sb.append('}'); + return sb.toString(); + } +} diff --git a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/mr/VariantLocusKeyPartitioner.java b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/mr/VariantLocusKeyPartitioner.java new file mode 100644 index 00000000000..7bb2a4dfa27 --- /dev/null +++ 
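The compareTo implementation above left-pads single-digit chromosome names with "0" before comparing, so lexicographic order matches natural genomic order. A minimal sketch illustrating the effect, assuming the VariantLocusKey class defined above is on the classpath:

    import java.util.Arrays;
    import java.util.Collections;
    import java.util.List;

    public class VariantLocusKeyOrderDemo {
        public static void main(String[] args) {
            List<VariantLocusKey> keys = Arrays.asList(
                    new VariantLocusKey("10", 100),
                    new VariantLocusKey("2", 100),
                    new VariantLocusKey("1", 200),
                    new VariantLocusKey("X", 100),
                    new VariantLocusKey("1", 100));
            Collections.sort(keys);
            // Plain string comparison would give 1 < 10 < 2; the "0" padding applied in
            // compareTo() yields the genomic order 1:100, 1:200, 2:100, 10:100, X:100 instead
            for (VariantLocusKey key : keys) {
                System.out.println(key.getChromosome() + ":" + key.getPosition());
            }
        }
    }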
b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/mr/VariantLocusKeyPartitioner.java @@ -0,0 +1,50 @@ +package org.opencb.opencga.storage.hadoop.variant.mr; + +import org.apache.hadoop.conf.Configurable; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.mapreduce.Job; +import org.apache.hadoop.mapreduce.Partitioner; +import org.opencb.opencga.storage.hadoop.variant.GenomeHelper; + +import java.io.IOException; +import java.util.List; +import java.util.TreeMap; + +public class VariantLocusKeyPartitioner extends Partitioner implements Configurable { + + private final TreeMap regionSplitsMap = new TreeMap<>(); + private Configuration conf; + + @Override + public void setConf(Configuration conf) { + this.conf = conf; + try { + Job job = Job.getInstance(conf); + int numReduceTasks = job.getNumReduceTasks(); + setup(numReduceTasks); + } catch (IOException e) { + throw new RuntimeException(e); + } + } + + public TreeMap setup(int numPartitions) { + List splits = GenomeHelper.generateBootPreSplitsHuman( + numPartitions, VariantLocusKey::new, VariantLocusKey::compareTo, false); + regionSplitsMap.put(new VariantLocusKey("0", 0), 0); + for (int i = 0; i < splits.size(); i++) { + regionSplitsMap.put(splits.get(i), regionSplitsMap.size()); + } + return regionSplitsMap; + } + + @Override + public Configuration getConf() { + return conf; + } + + @Override + public int getPartition(VariantLocusKey variantLocusKey, V v, int numPartitions) { + return regionSplitsMap.floorEntry(variantLocusKey).getValue(); + } + +} diff --git a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/test/java/org/opencb/opencga/storage/hadoop/variant/mr/StreamVariantMapperTest.java b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/test/java/org/opencb/opencga/storage/hadoop/variant/mr/StreamVariantMapperTest.java deleted file mode 100644 index 690a16df5f8..00000000000 --- a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/test/java/org/opencb/opencga/storage/hadoop/variant/mr/StreamVariantMapperTest.java +++ /dev/null @@ -1,41 +0,0 @@ -package org.opencb.opencga.storage.hadoop.variant.mr; - -import org.junit.Test; -import org.junit.experimental.categories.Category; -import org.opencb.opencga.core.testclassification.duration.ShortTests; - -import static org.junit.Assert.*; - - -@Category(ShortTests.class) -public class StreamVariantMapperTest { - @Test - public void buildOutputKeyPrefixSingleDigitChromosome() { - String result = StreamVariantMapper.buildOutputKeyPrefix("1", 100); - assertEquals("01|0000000100|", result); - } - - @Test - public void buildOutputKeyPrefixDoubleDigitChromosome() { - String result = StreamVariantMapper.buildOutputKeyPrefix("22", 100); - assertEquals("22|0000000100|", result); - } - - @Test - public void buildOutputKeyPrefixRandomChromosome() { - String result = StreamVariantMapper.buildOutputKeyPrefix("1_KI270712v1_random", 100); - assertEquals("01_KI270712v1_random|0000000100|", result); - } - - @Test - public void buildOutputKeyPrefixMTChromosome() { - String result = StreamVariantMapper.buildOutputKeyPrefix("MT", 100); - assertEquals("MT|0000000100|", result); - } - - @Test - public void buildOutputKeyPrefixXChromosome() { - String result = StreamVariantMapper.buildOutputKeyPrefix("X", 100); - assertEquals("X|0000000100|", result); - } -} \ No newline at end of file diff --git 
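The partitioner resolves each key with TreeMap.floorEntry, i.e. it picks the partition of the greatest pre-computed split point that is less than or equal to the key, so every reducer receives one contiguous genomic range. A short usage sketch, consistent with the unit test later in this section (with 10 partitions, chromosome 5 lands on partition 2):

    // Hypothetical stand-alone usage; inside a MapReduce job the framework calls
    // setConf() and getPartition() itself once the partitioner class is registered.
    VariantLocusKeyPartitioner<Object> partitioner = new VariantLocusKeyPartitioner<>();
    partitioner.setup(10); // pre-computes 10 genome-wide split points

    // floorEntry(5:0) -> greatest split point <= 5:0 -> partition 2
    int partition = partitioner.getPartition(new VariantLocusKey("5", 0), null, 10);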
a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/test/java/org/opencb/opencga/storage/hadoop/variant/mr/StreamVariantPartitionerTest.java b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/test/java/org/opencb/opencga/storage/hadoop/variant/mr/StreamVariantPartitionerTest.java deleted file mode 100644 index bbce3cd5cf0..00000000000 --- a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/test/java/org/opencb/opencga/storage/hadoop/variant/mr/StreamVariantPartitionerTest.java +++ /dev/null @@ -1,57 +0,0 @@ -package org.opencb.opencga.storage.hadoop.variant.mr; - -import org.apache.hadoop.hbase.io.ImmutableBytesWritable; -import org.apache.hadoop.hbase.util.Bytes; -import org.junit.Before; -import org.junit.Test; -import org.junit.experimental.categories.Category; -import org.opencb.opencga.core.testclassification.duration.ShortTests; - -import static org.junit.Assert.assertEquals; - -@Category(ShortTests.class) -public class StreamVariantPartitionerTest { - - public static final int NUM_PARTITIONS = 10; - private StreamVariantPartitioner partitioner; - - @Before - public void setUp() { - partitioner = new StreamVariantPartitioner(); - partitioner.setup(NUM_PARTITIONS); - } - - @Test - public void partitionerTest() { - assertEquals(0, partitioner.getPartition(new ImmutableBytesWritable(Bytes.toBytes("o:00|0000000001|")), null, NUM_PARTITIONS)); - assertEquals(0, partitioner.getPartition(new ImmutableBytesWritable(Bytes.toBytes("o:01|0000000000|")), null, NUM_PARTITIONS)); - assertEquals(0, partitioner.getPartition(new ImmutableBytesWritable(Bytes.toBytes("o:02|0000000000|")), null, NUM_PARTITIONS)); - assertEquals(1, partitioner.getPartition(new ImmutableBytesWritable(Bytes.toBytes("o:03|0000000000|")), null, NUM_PARTITIONS)); - assertEquals(2, partitioner.getPartition(new ImmutableBytesWritable(Bytes.toBytes("o:04|0000000000|")), null, NUM_PARTITIONS)); - assertEquals(2, partitioner.getPartition(new ImmutableBytesWritable(Bytes.toBytes("o:05|0000000000|")), null, NUM_PARTITIONS)); - assertEquals(3, partitioner.getPartition(new ImmutableBytesWritable(Bytes.toBytes("o:06|0000000000|")), null, NUM_PARTITIONS)); - assertEquals(3, partitioner.getPartition(new ImmutableBytesWritable(Bytes.toBytes("o:07|0000000000|")), null, NUM_PARTITIONS)); - assertEquals(4, partitioner.getPartition(new ImmutableBytesWritable(Bytes.toBytes("o:08|0000000000|")), null, NUM_PARTITIONS)); - assertEquals(4, partitioner.getPartition(new ImmutableBytesWritable(Bytes.toBytes("o:09|0000000000|")), null, NUM_PARTITIONS)); - assertEquals(5, partitioner.getPartition(new ImmutableBytesWritable(Bytes.toBytes("o:10|0000000000|")), null, NUM_PARTITIONS)); - assertEquals(5, partitioner.getPartition(new ImmutableBytesWritable(Bytes.toBytes("o:11|0000000000|")), null, NUM_PARTITIONS)); - assertEquals(6, partitioner.getPartition(new ImmutableBytesWritable(Bytes.toBytes("o:12|0000000000|")), null, NUM_PARTITIONS)); - assertEquals(6, partitioner.getPartition(new ImmutableBytesWritable(Bytes.toBytes("o:13|0000000000|")), null, NUM_PARTITIONS)); - assertEquals(7, partitioner.getPartition(new ImmutableBytesWritable(Bytes.toBytes("o:14|0000000000|")), null, NUM_PARTITIONS)); - assertEquals(7, partitioner.getPartition(new ImmutableBytesWritable(Bytes.toBytes("o:15|0000000000|")), null, NUM_PARTITIONS)); - assertEquals(7, partitioner.getPartition(new ImmutableBytesWritable(Bytes.toBytes("o:16|0000000000|")), null, NUM_PARTITIONS)); - assertEquals(8, 
partitioner.getPartition(new ImmutableBytesWritable(Bytes.toBytes("o:17|0000000000|")), null, NUM_PARTITIONS)); - assertEquals(8, partitioner.getPartition(new ImmutableBytesWritable(Bytes.toBytes("o:17_random_contig|0000000000|")), null, NUM_PARTITIONS)); - assertEquals(8, partitioner.getPartition(new ImmutableBytesWritable(Bytes.toBytes("o:18|0000000000|")), null, NUM_PARTITIONS)); - assertEquals(8, partitioner.getPartition(new ImmutableBytesWritable(Bytes.toBytes("o:19|0000000000|")), null, NUM_PARTITIONS)); - assertEquals(8, partitioner.getPartition(new ImmutableBytesWritable(Bytes.toBytes("o:20|0000000000|")), null, NUM_PARTITIONS)); - assertEquals(8, partitioner.getPartition(new ImmutableBytesWritable(Bytes.toBytes("o:21|0000000000|")), null, NUM_PARTITIONS)); - assertEquals(9, partitioner.getPartition(new ImmutableBytesWritable(Bytes.toBytes("o:22|0000000000|")), null, NUM_PARTITIONS)); - assertEquals(9, partitioner.getPartition(new ImmutableBytesWritable(Bytes.toBytes("o:X|0000000000|")), null, NUM_PARTITIONS)); - assertEquals(9, partitioner.getPartition(new ImmutableBytesWritable(Bytes.toBytes("o:Y|0000000000|")), null, NUM_PARTITIONS)); - assertEquals(9, partitioner.getPartition(new ImmutableBytesWritable(Bytes.toBytes("o:MT|0000000000|")), null, NUM_PARTITIONS)); - assertEquals(9, partitioner.getPartition(new ImmutableBytesWritable(Bytes.toBytes("o:Z|0000000000|")), null, NUM_PARTITIONS)); - assertEquals(9, partitioner.getPartition(new ImmutableBytesWritable(Bytes.toBytes("o:Z_random_contig|0000000000|")), null, NUM_PARTITIONS)); - } - -} \ No newline at end of file diff --git a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/test/java/org/opencb/opencga/storage/hadoop/variant/mr/VariantLocusKeyPartitionerTest.java b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/test/java/org/opencb/opencga/storage/hadoop/variant/mr/VariantLocusKeyPartitionerTest.java new file mode 100644 index 00000000000..8c4a1966da3 --- /dev/null +++ b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/test/java/org/opencb/opencga/storage/hadoop/variant/mr/VariantLocusKeyPartitionerTest.java @@ -0,0 +1,56 @@ +package org.opencb.opencga.storage.hadoop.variant.mr; + +import org.junit.Before; +import org.junit.Test; +import org.junit.experimental.categories.Category; +import org.opencb.opencga.core.testclassification.duration.ShortTests; + +import static org.junit.Assert.assertEquals; + +@Category(ShortTests.class) +public class VariantLocusKeyPartitionerTest { + + public static final int NUM_PARTITIONS = 10; + private VariantLocusKeyPartitioner partitioner; + + @Before + public void setUp() { + partitioner = new VariantLocusKeyPartitioner<>(); + partitioner.setup(NUM_PARTITIONS); + } + + @Test + public void partitionerTest() { + assertEquals(0, partitioner.getPartition(new VariantLocusKey("0",1), null, NUM_PARTITIONS)); + assertEquals(0, partitioner.getPartition(new VariantLocusKey("1",0), null, NUM_PARTITIONS)); + assertEquals(0, partitioner.getPartition(new VariantLocusKey("2",0), null, NUM_PARTITIONS)); + assertEquals(1, partitioner.getPartition(new VariantLocusKey("3",0), null, NUM_PARTITIONS)); + assertEquals(2, partitioner.getPartition(new VariantLocusKey("4",0), null, NUM_PARTITIONS)); + assertEquals(2, partitioner.getPartition(new VariantLocusKey("5",0), null, NUM_PARTITIONS)); + assertEquals(3, partitioner.getPartition(new VariantLocusKey("6",0), null, NUM_PARTITIONS)); + assertEquals(3, partitioner.getPartition(new 
VariantLocusKey("7",0), null, NUM_PARTITIONS)); + assertEquals(4, partitioner.getPartition(new VariantLocusKey("8",0), null, NUM_PARTITIONS)); + assertEquals(4, partitioner.getPartition(new VariantLocusKey("9",0), null, NUM_PARTITIONS)); + assertEquals(5, partitioner.getPartition(new VariantLocusKey("10",0), null, NUM_PARTITIONS)); + assertEquals(5, partitioner.getPartition(new VariantLocusKey("11",0), null, NUM_PARTITIONS)); + assertEquals(6, partitioner.getPartition(new VariantLocusKey("12",0), null, NUM_PARTITIONS)); + assertEquals(6, partitioner.getPartition(new VariantLocusKey("13",0), null, NUM_PARTITIONS)); + assertEquals(7, partitioner.getPartition(new VariantLocusKey("14",0), null, NUM_PARTITIONS)); + assertEquals(7, partitioner.getPartition(new VariantLocusKey("15",0), null, NUM_PARTITIONS)); + assertEquals(7, partitioner.getPartition(new VariantLocusKey("16",0), null, NUM_PARTITIONS)); + assertEquals(8, partitioner.getPartition(new VariantLocusKey("17",0), null, NUM_PARTITIONS)); + assertEquals(8, partitioner.getPartition(new VariantLocusKey("17_random_contig",0), null, NUM_PARTITIONS)); + assertEquals(8, partitioner.getPartition(new VariantLocusKey("18",0), null, NUM_PARTITIONS)); + assertEquals(8, partitioner.getPartition(new VariantLocusKey("18",70880000), null, NUM_PARTITIONS)); + assertEquals(8, partitioner.getPartition(new VariantLocusKey("19",0), null, NUM_PARTITIONS)); + assertEquals(8, partitioner.getPartition(new VariantLocusKey("20",0), null, NUM_PARTITIONS)); + assertEquals(8, partitioner.getPartition(new VariantLocusKey("21",0), null, NUM_PARTITIONS)); + assertEquals(9, partitioner.getPartition(new VariantLocusKey("22",0), null, NUM_PARTITIONS)); + assertEquals(9, partitioner.getPartition(new VariantLocusKey("X",0), null, NUM_PARTITIONS)); + assertEquals(9, partitioner.getPartition(new VariantLocusKey("Y",0), null, NUM_PARTITIONS)); + assertEquals(9, partitioner.getPartition(new VariantLocusKey("MT",0), null, NUM_PARTITIONS)); + assertEquals(9, partitioner.getPartition(new VariantLocusKey("Z",0), null, NUM_PARTITIONS)); + assertEquals(9, partitioner.getPartition(new VariantLocusKey("Z_random_contig",0), null, NUM_PARTITIONS)); + } + +} \ No newline at end of file diff --git a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/test/java/org/opencb/opencga/storage/hadoop/variant/mr/VariantLocusKeyTest.java b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/test/java/org/opencb/opencga/storage/hadoop/variant/mr/VariantLocusKeyTest.java new file mode 100644 index 00000000000..5263d749975 --- /dev/null +++ b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/test/java/org/opencb/opencga/storage/hadoop/variant/mr/VariantLocusKeyTest.java @@ -0,0 +1,152 @@ +package org.opencb.opencga.storage.hadoop.variant.mr; + +import org.junit.Test; +import org.junit.experimental.categories.Category; +import org.opencb.opencga.core.testclassification.duration.ShortTests; + +import java.util.Arrays; +import java.util.List; + +import static org.junit.Assert.*; + +@Category(ShortTests.class) +public class VariantLocusKeyTest { + + @Test + public void shouldReturnTrueForEqualVariantLocusKeys() { + VariantLocusKey key1 = new VariantLocusKey("1", 1000); + VariantLocusKey key2 = new VariantLocusKey("1", 1000); + assertTrue(key1.equals(key2)); + } + + @Test + public void shouldReturnFalseForDifferentVariantLocusKeys() { + VariantLocusKey key1 = new VariantLocusKey("1", 1000); + VariantLocusKey key2 = new VariantLocusKey("2", 1000); + 
assertFalse(key1.equals(key2)); + } + + @Test + public void shouldReturnFalseForNullVariantLocusKey() { + VariantLocusKey key1 = new VariantLocusKey("1", 1000); + assertFalse(key1.equals(null)); + } + + @Test + public void shouldReturnFalseForDifferentObjectType() { + VariantLocusKey key1 = new VariantLocusKey("1", 1000); + String otherObject = "someString"; + assertFalse(key1.equals(otherObject)); + } + + @Test + public void shouldReturnConsistentHashCodeForEqualVariantLocusKeys() { + VariantLocusKey key1 = new VariantLocusKey("1", 1000); + VariantLocusKey key2 = new VariantLocusKey("1", 1000); + assertEquals(key1.hashCode(), key2.hashCode()); + } + + @Test + public void shouldReturnDifferentHashCodeForDifferentVariantLocusKeys() { + VariantLocusKey key1 = new VariantLocusKey("1", 1000); + VariantLocusKey key2 = new VariantLocusKey("2", 1000); + assertNotEquals(key1.hashCode(), key2.hashCode()); + } + + @Test + public void shouldReturnZeroForEqualVariantLocusKeys() { + VariantLocusKey key1 = new VariantLocusKey("1", 1000, "A"); + VariantLocusKey key2 = new VariantLocusKey("1", 1000, "A"); + assertEquals(0, key1.compareTo(key2)); + } + + @Test + public void shouldReturnNegativeForSmallerChromosome() { + VariantLocusKey key1 = new VariantLocusKey("1", 1000, "A"); + VariantLocusKey key2 = new VariantLocusKey("2", 1000, "A"); + assertTrue(key1.compareTo(key2) < 0); + } + + @Test + public void shouldReturnPositiveForLargerChromosome() { + VariantLocusKey key1 = new VariantLocusKey("2", 1000, "A"); + VariantLocusKey key2 = new VariantLocusKey("1", 1000, "A"); + assertTrue(key1.compareTo(key2) > 0); + } + + @Test + public void shouldReturnNegativeForSmallerPosition() { + VariantLocusKey key1 = new VariantLocusKey("1", 999, "A"); + VariantLocusKey key2 = new VariantLocusKey("1", 1000, "A"); + assertTrue(key1.compareTo(key2) < 0); + } + + @Test + public void shouldReturnPositiveForLargerPosition() { + VariantLocusKey key1 = new VariantLocusKey("1", 1001, "A"); + VariantLocusKey key2 = new VariantLocusKey("1", 1000, "A"); + assertTrue(key1.compareTo(key2) > 0); + } + + @Test + public void shouldReturnNegativeForSmallerOther() { + VariantLocusKey key1 = new VariantLocusKey("1", 1000, "A"); + VariantLocusKey key2 = new VariantLocusKey("1", 1000, "B"); + assertTrue(key1.compareTo(key2) < 0); + } + + @Test + public void shouldReturnPositiveForLargerOther() { + VariantLocusKey key1 = new VariantLocusKey("1", 1000, "B"); + VariantLocusKey key2 = new VariantLocusKey("1", 1000, "A"); + assertTrue(key1.compareTo(key2) > 0); + } + + @Test + public void shouldReturnZeroWhenBothOtherAreNull() { + VariantLocusKey key1 = new VariantLocusKey("1", 1000, null); + VariantLocusKey key2 = new VariantLocusKey("1", 1000, null); + assertEquals(0, key1.compareTo(key2)); + } + + @Test + public void shouldReturnNegativeWhenOtherIsNull() { + VariantLocusKey key1 = new VariantLocusKey("1", 1000, null); + VariantLocusKey key2 = new VariantLocusKey("1", 1000, "A"); + assertTrue(key1.compareTo(key2) < 0); + } + + @Test + public void shouldReturnPositiveWhenOtherIsNotNull() { + VariantLocusKey key1 = new VariantLocusKey("1", 1000, "A"); + VariantLocusKey key2 = new VariantLocusKey("1", 1000, null); + assertTrue(key1.compareTo(key2) > 0); + } + + @Test + public void shouldCompareChromosomesCorrectly() { + List keys = Arrays.asList( + new VariantLocusKey("1", 1000, "A"), + new VariantLocusKey("1_random", 1000, "A"), + new VariantLocusKey("2", 1000, "A"), + new VariantLocusKey("9", 1000, "A"), + new VariantLocusKey("10", 1000, "A"), 
+                new VariantLocusKey("10_random", 1000, "A"),
+                new VariantLocusKey("19", 1000, "A"),
+                new VariantLocusKey("20", 1000, "A"),
+                new VariantLocusKey("22", 1000, "A"),
+                new VariantLocusKey("X", 1000, "A"),
+                new VariantLocusKey("Y", 1000, "A")
+        );
+
+        VariantLocusKey prevKey = null;
+        for (VariantLocusKey key : keys) {
+            if (prevKey == null) {
+                prevKey = key;
+            } else {
+                assertTrue(prevKey + " < " + key, prevKey.compareTo(key) < 0);
+                prevKey = key;
+            }
+        }
+    }
+}
\ No newline at end of file

From 0df69dcc8fd540110fb8fb281da30ecc6a227070 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jacobo=20Coll=20Morag=C3=B3n?=
Date: Thu, 31 Oct 2024 13:51:13 +0000
Subject: [PATCH 21/66] storage: Use VariantLocusKey and VariantLocusKeyPartitioner in VariantExporterDriver. #TASK-6722

---
 .../core/variant/io/VariantWriterFactory.java | 23 +++++++++++---
 .../hadoop/utils/AbstractHBaseDriver.java | 5 +--
 .../variant/io/HadoopVariantExporter.java | 4 +--
 .../variant/io/VariantExporterDriver.java | 31 +++++++++++++------
 .../variant/mr/StreamVariantMapper.java | 6 ++--
 5 files changed, 49 insertions(+), 20 deletions(-)

diff --git a/opencga-storage/opencga-storage-core/src/main/java/org/opencb/opencga/storage/core/variant/io/VariantWriterFactory.java b/opencga-storage/opencga-storage-core/src/main/java/org/opencb/opencga/storage/core/variant/io/VariantWriterFactory.java
index 61c2e6552d5..157a9a8465a 100644
--- a/opencga-storage/opencga-storage-core/src/main/java/org/opencb/opencga/storage/core/variant/io/VariantWriterFactory.java
+++ b/opencga-storage/opencga-storage-core/src/main/java/org/opencb/opencga/storage/core/variant/io/VariantWriterFactory.java
@@ -76,11 +76,11 @@ public enum VariantOutputFormat {
     VCF_GZ("vcf.gz", false),
     JSON("json"),
     JSON_GZ("json.gz"),
-    AVRO("avro"),
-    AVRO_GZ("avro.gz"),
-    AVRO_SNAPPY("avro.snappy"),
-    PARQUET("parquet"),
-    PARQUET_GZ("parquet.gz"),
+    AVRO("avro", true, true),
+    AVRO_GZ("avro.gz", true, true),
+    AVRO_SNAPPY("avro.snappy", true, true),
+    PARQUET("parquet", true, true),
+    PARQUET_GZ("parquet.gz", true, true),
     STATS("stats.tsv", false),
     STATS_GZ("stats.tsv.gz", false),
     CELLBASE("frequencies.json"),
@@ -90,16 +90,25 @@ public enum VariantOutputFormat {
     ENSEMBL_VEP_GZ("vep.txt.gz", false);

     private final boolean multiStudy;
+    private final boolean binary;
     private final String extension;

     VariantOutputFormat(String extension) {
         this.extension = extension;
         this.multiStudy = true;
+        this.binary = false;
     }

     VariantOutputFormat(String extension, boolean multiStudy) {
         this.multiStudy = multiStudy;
         this.extension = extension;
+        this.binary = false;
+    }
+
+    VariantOutputFormat(String extension, boolean multiStudy, boolean binary) {
+        this.multiStudy = multiStudy;
+        this.extension = extension;
+        this.binary = binary;
     }

     public String getExtension() {
@@ -122,6 +131,10 @@ public boolean isSnappy() {
         return extension.endsWith(".snappy");
     }

+    public boolean isBinary() {
+        return binary;
+    }
+
     public VariantOutputFormat inPlain() {
         if (!isPlain()) {
             return VariantOutputFormat.valueOf(name().replace("_GZ", "").replace("_SNAPPY", ""));
diff --git a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/utils/AbstractHBaseDriver.java b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/utils/AbstractHBaseDriver.java
index e86a8dd3fb7..8ee2092ce0a 100644
---
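The new binary flag lets callers distinguish line-oriented text formats from binary container formats such as Avro and Parquet, which cannot be treated as streams of text records. A hedged sketch of the intended use, mirroring the guard added to VariantExporterDriver later in this patch:

    VariantWriterFactory.VariantOutputFormat format = VariantWriterFactory.VariantOutputFormat.AVRO;
    if (format.isBinary()) {
        // Binary formats must go through the Avro/Parquet-specific code paths;
        // they cannot be concatenated or streamed line by line like VCF or JSON
        throw new IllegalArgumentException("Unexpected binary output format " + format);
    }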
a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/utils/AbstractHBaseDriver.java +++ b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/utils/AbstractHBaseDriver.java @@ -477,7 +477,7 @@ protected List concatMrOutputToLocal(Path mrOutdir, Path localOutput, bool LOGGER.info(" Target : {}", localOutput.toUri()); fileSystem.copyToLocalFile(false, paths.get(0), localOutput); } else { - LOGGER.info("Concat and copy to local " + paths.size()); + LOGGER.info("Concat and copy to local : " + paths.size() + " partial files"); LOGGER.info(" Source : " + mrOutdir.toUri()); LOGGER.info(" Target : " + localOutput.toUri()); LOGGER.info(" ---- "); @@ -487,7 +487,8 @@ protected List concatMrOutputToLocal(Path mrOutdir, Path localOutput, bool OutputStream os = gzOs == null ? fsOs : gzOs; for (int i = 0; i < paths.size(); i++) { Path path = paths.get(i); - LOGGER.info("Concat {} : '{}' ({}) ", + LOGGER.info("[{}] Concat {} : '{}' ({}) ", + i, isGzip ? "gzip file" : "file", path.toUri(), humanReadableByteCount(fileSystem.getFileStatus(path).getLen(), false)); diff --git a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/io/HadoopVariantExporter.java b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/io/HadoopVariantExporter.java index b5b0cb0e6e0..2bc76ab5e99 100644 --- a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/io/HadoopVariantExporter.java +++ b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/io/HadoopVariantExporter.java @@ -180,13 +180,13 @@ public List export(@Nullable URI outputFileUri, VariantWriterFactory.Varian logger.info("Query for approximately {} of {} variants, which is {}% of the total." + " Consider small query." + " Skip MapReduce", - count, totalCount, matchRate * 100); + count, totalCount, String.format("%.2f", matchRate * 100)); smallQuery = true; } else { logger.info("Query for approximately {} of {} variants, which is {}% of the total." + " Current variants threshold is {}, and matchRatioThreshold is {}% ." 
+ " Not a small query", - count, totalCount, matchRate * 100, variantsThreshold, matchRatioThreshold); + count, totalCount, String.format("%.2f", matchRate * 100), variantsThreshold, matchRatioThreshold); } } catch (VariantSearchException e) { logger.info("Unable to count variants from SearchEngine", e); diff --git a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/io/VariantExporterDriver.java b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/io/VariantExporterDriver.java index c44e686e4d2..d2489fac1c7 100644 --- a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/io/VariantExporterDriver.java +++ b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/io/VariantExporterDriver.java @@ -12,6 +12,7 @@ import org.apache.hadoop.io.compress.SnappyCodec; import org.apache.hadoop.mapred.JobContext; import org.apache.hadoop.mapreduce.Job; +import org.apache.hadoop.mapreduce.Partitioner; import org.apache.hadoop.mapreduce.Reducer; import org.apache.hadoop.mapreduce.TaskAttemptContext; import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; @@ -25,6 +26,8 @@ import org.opencb.opencga.storage.core.variant.io.VariantWriterFactory; import org.opencb.opencga.storage.hadoop.variant.AbstractVariantsTableDriver; import org.opencb.opencga.storage.hadoop.variant.mr.VariantFileOutputFormat; +import org.opencb.opencga.storage.hadoop.variant.mr.VariantLocusKey; +import org.opencb.opencga.storage.hadoop.variant.mr.VariantLocusKeyPartitioner; import org.opencb.opencga.storage.hadoop.variant.mr.VariantMapper; import java.io.IOException; @@ -41,6 +44,7 @@ public class VariantExporterDriver extends VariantDriver { private Class mapperClass; private Class reducerClass; private Class outputFormatClass; + private Class partitioner; @Override protected void parseAndValidateParameters() throws IOException { @@ -59,6 +63,11 @@ protected Class getReducerClass() { return reducerClass; } + @Override + protected Class getPartitioner() { + return partitioner; + } + @Override protected Class getOutputFormatClass() { return outputFormatClass; @@ -76,7 +85,7 @@ protected void setupJob(Job job) throws IOException { case AVRO: outputFormatClass = AvroKeyOutputFormat.class; if (useReduceStep) { - job.setMapOutputKeyClass(NullWritable.class); + job.setMapOutputKeyClass(VariantLocusKey.class); AvroJob.setMapOutputValueSchema(job, VariantAvro.getClassSchema()); AvroJob.setOutputKeySchema(job, VariantAvro.getClassSchema()); job.setOutputValueClass(NullWritable.class); @@ -108,11 +117,15 @@ protected void setupJob(Job job) throws IOException { } break; default: + if (outputFormat.isBinary()) { + throw new IllegalArgumentException("Unexpected binary output format " + outputFormat); + } if (useReduceStep) { - job.setMapOutputKeyClass(NullWritable.class); + job.setMapOutputKeyClass(VariantLocusKey.class); AvroJob.setMapOutputValueSchema(job, VariantAvro.getClassSchema()); mapperClass = AvroVariantExporterMapper.class; reducerClass = VariantExporterReducer.class; + partitioner = VariantLocusKeyPartitioner.class; } else { AvroJob.setOutputKeySchema(job, VariantAvro.getClassSchema()); mapperClass = VariantExporterDirectMapper.class; @@ -182,7 +195,7 @@ protected void map(Object key, Variant value, Context context) throws IOExceptio * @see 
VariantExporterReducer * @see AvroKeyVariantExporterReducer */ - public static class AvroVariantExporterMapper extends VariantMapper> { + public static class AvroVariantExporterMapper extends VariantMapper> { @Override protected void setup(Context context) throws IOException, InterruptedException { super.setup(context); @@ -193,7 +206,7 @@ protected void setup(Context context) throws IOException, InterruptedException { protected void map(Object key, Variant value, Context context) throws IOException, InterruptedException { context.getCounter(COUNTER_GROUP_NAME, "variants").increment(1); removeNullsFromAvro(value.getImpl(), context); - context.write(NullWritable.get(), new AvroValue<>(value.getImpl())); + context.write(new VariantLocusKey(value), new AvroValue<>(value.getImpl())); } } @@ -203,9 +216,9 @@ protected void map(Object key, Variant value, Context context) throws IOExceptio * @see AvroVariantExporterMapper * @see VariantWriterFactory.VariantOutputFormat */ - public static class VariantExporterReducer extends Reducer, Variant, NullWritable> { + public static class VariantExporterReducer extends Reducer, Variant, NullWritable> { @Override - protected void reduce(NullWritable key, Iterable> values, Context context) + protected void reduce(T key, Iterable> values, Context context) throws IOException, InterruptedException { for (AvroValue value : values) { context.write(new Variant(value.datum()), NullWritable.get()); @@ -219,10 +232,10 @@ protected void reduce(NullWritable key, Iterable> values, * @see AvroVariantExporterMapper * @see AvroKeyOutputFormat */ - public static class AvroKeyVariantExporterReducer - extends Reducer, AvroKey, NullWritable> { + public static class AvroKeyVariantExporterReducer + extends Reducer, AvroKey, NullWritable> { @Override - protected void reduce(NullWritable key, Iterable> values, Context context) + protected void reduce(T key, Iterable> values, Context context) throws IOException, InterruptedException { for (AvroValue value : values) { context.write(new AvroKey<>(value.datum()), NullWritable.get()); diff --git a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/mr/StreamVariantMapper.java b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/mr/StreamVariantMapper.java index 03c3aa6b0b0..048c3455e17 100644 --- a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/mr/StreamVariantMapper.java +++ b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/mr/StreamVariantMapper.java @@ -469,7 +469,8 @@ public void run() { private void write(Text line) throws IOException, InterruptedException { numRecords++; - context.write(new VariantLocusKey(currentChromosome, currentPosition, StreamVariantReducer.STDOUT_KEY + (stdoutKeyNum++)), line); + context.write(new VariantLocusKey(currentChromosome, currentPosition, + StreamVariantReducer.STDOUT_KEY + (stdoutKeyNum++)), line); } } @@ -550,7 +551,8 @@ private void write(String line) throws IOException, InterruptedException { } private void write(Text line) throws IOException, InterruptedException { - context.write(new VariantLocusKey(currentChromosome, currentPosition, StreamVariantReducer.STDERR_KEY + (stderrKeyNum++)), line); + context.write(new VariantLocusKey(currentChromosome, currentPosition, + StreamVariantReducer.STDERR_KEY + 
(stderrKeyNum++)), line); } private boolean matchesReporter(String line) { From f6fd3d46642b763c84075d0f7799a0fd0ca26add Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jacobo=20Coll=20Morag=C3=B3n?= Date: Fri, 1 Nov 2024 09:47:07 +0000 Subject: [PATCH 22/66] storage: Fix VariantLocusKey serialization. #TASK-6722 --- .../hadoop/utils/AbstractHBaseDriver.java | 41 +++++++++++++++++-- .../variant/mr/StreamVariantMapper.java | 10 ++--- .../variant/mr/StreamVariantReducer.java | 8 +++- .../hadoop/variant/mr/VariantLocusKey.java | 11 +++-- .../variant/mr/VariantLocusKeyTest.java | 26 ++++++++++++ 5 files changed, 80 insertions(+), 16 deletions(-) diff --git a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/utils/AbstractHBaseDriver.java b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/utils/AbstractHBaseDriver.java index 8ee2092ce0a..99a119264c3 100644 --- a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/utils/AbstractHBaseDriver.java +++ b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/utils/AbstractHBaseDriver.java @@ -6,6 +6,7 @@ import org.apache.commons.lang3.time.StopWatch; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.conf.Configured; +import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.*; import org.apache.hadoop.hbase.HBaseConfiguration; import org.apache.hadoop.hbase.mapreduce.TableInputFormat; @@ -19,6 +20,10 @@ import org.apache.hadoop.util.Tool; import org.apache.hadoop.util.ToolRunner; import org.apache.hadoop.yarn.api.records.ApplicationId; +import org.apache.hadoop.yarn.api.records.ApplicationReport; +import org.apache.hadoop.yarn.api.records.YarnApplicationState; +import org.apache.hadoop.yarn.client.api.YarnClient; +import org.apache.hadoop.yarn.exceptions.YarnException; import org.apache.parquet.hadoop.ParquetFileWriter; import org.apache.phoenix.mapreduce.util.PhoenixConfigurationUtil; import org.opencb.commons.datastore.core.ObjectMap; @@ -34,11 +39,9 @@ import java.nio.charset.Charset; import java.nio.file.Files; import java.nio.file.Paths; -import java.util.ArrayList; -import java.util.Arrays; -import java.util.List; -import java.util.Map; +import java.util.*; import java.util.function.Supplier; +import java.util.stream.Collectors; import java.util.zip.GZIPInputStream; import java.util.zip.GZIPOutputStream; @@ -193,6 +196,7 @@ public final int run(String[] args) throws Exception { LOGGER.info(" - Outdir : " + job.getConfiguration().get(FileOutputFormat.OUTDIR)); } LOGGER.info("================================================="); + reportRunningJobs(); boolean succeed = executeJob(job); if (!succeed) { LOGGER.error("error with job!"); @@ -215,6 +219,32 @@ public final int run(String[] args) throws Exception { return succeed ? 
0 : 1; } + private void reportRunningJobs() { + // Get the number of pending or running jobs in yarn + try (YarnClient yarnClient = YarnClient.createYarnClient()) { + yarnClient.init(getConf()); + yarnClient.start(); + + List applications = yarnClient.getApplications(EnumSet.of( + YarnApplicationState.NEW, + YarnApplicationState.NEW_SAVING, + YarnApplicationState.SUBMITTED, + YarnApplicationState.ACCEPTED, + YarnApplicationState.RUNNING)); + if (applications.isEmpty()) { + LOGGER.info("No pending or running jobs in yarn"); + } else { + LOGGER.info("Found " + applications.size() + " pending or running jobs in yarn"); + for (Map.Entry> entry : applications.stream() + .collect(Collectors.groupingBy(ApplicationReport::getYarnApplicationState)).entrySet()) { + LOGGER.info(" * " + entry.getKey() + " : " + entry.getValue().size()); + } + } + } catch (IOException | YarnException e) { + LOGGER.error("Error getting list of pending jobs from YARN", e); + } + } + private boolean configFromArgs(String[] args) { int fixedSizeArgs = getFixedSizeArgs(); @@ -468,6 +498,8 @@ protected List concatMrOutputToLocal(Path mrOutdir, Path localOutput, bool } } } + StopWatch stopWatch = new StopWatch(); + stopWatch.start(); if (paths.isEmpty()) { LOGGER.warn("The MapReduce job didn't produce any output. This may not be expected."); } else if (paths.size() == 1) { @@ -517,6 +549,7 @@ protected List concatMrOutputToLocal(Path mrOutdir, Path localOutput, bool } } LOGGER.info("File size : " + humanReadableByteCount(Files.size(Paths.get(localOutput.toUri())), false)); + LOGGER.info("Time to copy from HDFS and concat : " + TimeUtils.durationToString(stopWatch)); } return paths; } diff --git a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/mr/StreamVariantMapper.java b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/mr/StreamVariantMapper.java index 048c3455e17..f0a1de0f73e 100644 --- a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/mr/StreamVariantMapper.java +++ b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/mr/StreamVariantMapper.java @@ -131,12 +131,12 @@ public void run(Context context) throws IOException, InterruptedException { // or if the chromosome changes if (processedBytes > maxInputBytesPerProcess) { LOG.info("Processed bytes = " + processedBytes + " > " + maxInputBytesPerProcess + ". Restarting process."); - restartProcess(context, "BYTES_LIMIT"); + restartProcess(context, "bytes_limit"); } else if (!currentChromosome.equals(currentValue.getChromosome())) { // TODO: Should we change only when the chromosome change would produce a partition change? LOG.info("Chromosome changed from " + currentChromosome + " to " + currentValue.getChromosome() + ". 
Restarting process."); - restartProcess(context, "CHR_CHANGE"); + restartProcess(context, "chr_change"); } map(context.getCurrentKey(), currentValue, context); } while (!hasExceptions() && context.nextKeyValue()); @@ -169,14 +169,14 @@ public void run(Context context) throws IOException, InterruptedException { addException(th); } } else { - context.getCounter(COUNTER_GROUP_NAME, "EMPTY_INPUT_SPLIT").increment(1); + context.getCounter(COUNTER_GROUP_NAME, "empty_input_split").increment(1); } throwExceptionIfAny(); } private void restartProcess(Mapper.Context context, String reason) throws IOException, InterruptedException, StorageEngineException { - context.getCounter(COUNTER_GROUP_NAME, "RESTARTED_PROCESS_" + reason).increment(1); + context.getCounter(COUNTER_GROUP_NAME, "restarted_process_" + reason).increment(1); closeProcess(context); startProcess(context); } @@ -332,7 +332,7 @@ private void closeProcess(Context context) throws IOException, InterruptedExcept private void startProcess(Context context) throws IOException, StorageEngineException, InterruptedException { LOG.info("bash -ce '" + commandLine + "'"); - context.getCounter(COUNTER_GROUP_NAME, "START_PROCESS").increment(1); + context.getCounter(COUNTER_GROUP_NAME, "start_process").increment(1); Variant variant = context.getCurrentValue(); currentChromosome = variant.getChromosome(); diff --git a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/mr/StreamVariantReducer.java b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/mr/StreamVariantReducer.java index 25598e593f9..c10bbcb2591 100644 --- a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/mr/StreamVariantReducer.java +++ b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/mr/StreamVariantReducer.java @@ -57,9 +57,15 @@ protected void reduce(VariantLocusKey key, Iterable values, context.getCounter(VariantsTableMapReduceHelper.COUNTER_GROUP_NAME, "body_records").increment(1); } context.getCounter(VariantsTableMapReduceHelper.COUNTER_GROUP_NAME, "stdout_records").increment(1); - } else { + context.getCounter(VariantsTableMapReduceHelper.COUNTER_GROUP_NAME, "stdout_records_bytes") + .increment(value.getLength()); + } else if (key.getOther().startsWith(STDERR_KEY)) { mos.write("stderr", key, value); context.getCounter(VariantsTableMapReduceHelper.COUNTER_GROUP_NAME, "stderr_records").increment(1); + context.getCounter(VariantsTableMapReduceHelper.COUNTER_GROUP_NAME, "stderr_records_bytes") + .increment(value.getLength()); + } else { + throw new IllegalStateException("Unknown key " + key); } context.getCounter(VariantsTableMapReduceHelper.COUNTER_GROUP_NAME, "records").increment(1); } diff --git a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/mr/VariantLocusKey.java b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/mr/VariantLocusKey.java index a198e03de5c..ce6d4926120 100644 --- a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/mr/VariantLocusKey.java +++ 
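The hunk below replaces the writeChars/readLine serialization with writeUTF/readUTF. The original pairing could not round-trip a key: DataOutput.writeChars emits two bytes per character (UTF-16), while the deprecated, byte-oriented DataInput.readLine zero-extends every byte into a char. A minimal demonstration of the mismatch (illustrative only, not part of the patch):

    import java.io.*;

    public class WriteCharsReadLineDemo {
        public static void main(String[] args) throws IOException {
            ByteArrayOutputStream buffer = new ByteArrayOutputStream();
            DataOutputStream out = new DataOutputStream(buffer);
            out.writeChars("MT");  // UTF-16 bytes: 00 4D 00 54
            out.writeChars("\n");  // UTF-16 bytes: 00 0A

            DataInputStream in = new DataInputStream(new ByteArrayInputStream(buffer.toByteArray()));
            // readLine() is byte-oriented: it yields "\0M\0T\0" (NULs interleaved), not "MT"
            String chromosome = in.readLine();
            System.out.println("MT".equals(chromosome)); // prints false
        }
    }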
b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/mr/VariantLocusKey.java
@@ -71,21 +71,20 @@ public static boolean isSingleDigitChromosome(String chromosome) {

     @Override
     public void write(DataOutput out) throws IOException {
-        out.writeChars(chromosome);
-        out.writeChars("\n");
+        out.writeUTF(chromosome);
         out.writeInt(position);
         if (other != null) {
-            out.writeChars(other);
+            out.writeUTF(other);
         } else {
-            out.writeChars("");
+            out.writeUTF("");
         }
     }

     @Override
     public void readFields(DataInput in) throws IOException {
-        chromosome = in.readLine();
+        chromosome = in.readUTF();
         position = in.readInt();
-        other = in.readLine();
+        other = in.readUTF();
     }

     public String getChromosome() {
diff --git a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/test/java/org/opencb/opencga/storage/hadoop/variant/mr/VariantLocusKeyTest.java b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/test/java/org/opencb/opencga/storage/hadoop/variant/mr/VariantLocusKeyTest.java
index 5263d749975..74552d1f241 100644
--- a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/test/java/org/opencb/opencga/storage/hadoop/variant/mr/VariantLocusKeyTest.java
+++ b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/test/java/org/opencb/opencga/storage/hadoop/variant/mr/VariantLocusKeyTest.java
@@ -4,6 +4,7 @@
 import org.junit.experimental.categories.Category;
 import org.opencb.opencga.core.testclassification.duration.ShortTests;

+import java.io.*;
 import java.util.Arrays;
 import java.util.List;

@@ -149,4 +150,29 @@ public void shouldCompareChromosomesCorrectly() {
             }
         }
     }
+
+    @Test
+    public void testWriteAndRead() throws IOException {
+        testWriteAndRead(new VariantLocusKey("1_random", 1000, "A"));
+        testWriteAndRead(new VariantLocusKey("1", 3541316, "O:31231"));
+        testWriteAndRead(new VariantLocusKey("0", 3541316, "O:31231"));
+        testWriteAndRead(new VariantLocusKey("", 3541316, ""));
+        testWriteAndRead(new VariantLocusKey("", -2, ""));
+    }
+
+    private static void testWriteAndRead(VariantLocusKey originalKey) throws IOException {
+        // Write the object to a byte array output stream
+        ByteArrayOutputStream byteArrayOutputStream = new ByteArrayOutputStream();
+        DataOutputStream dataOutputStream = new DataOutputStream(byteArrayOutputStream);
+        originalKey.write(dataOutputStream);
+
+        // Read the object from a byte array input stream
+        ByteArrayInputStream byteArrayInputStream = new ByteArrayInputStream(byteArrayOutputStream.toByteArray());
+        DataInputStream dataInputStream = new DataInputStream(byteArrayInputStream);
+        VariantLocusKey readKey = new VariantLocusKey();
+        readKey.readFields(dataInputStream);
+
+        // Assert that the read object is equal to the original object
+        assertEquals(originalKey, readKey);
+    }
 }
\ No newline at end of file

From fa3c9f2a17ddd8a5492abf4fcdfd5283b5c5924e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jacobo=20Coll=20Morag=C3=B3n?=
Date: Mon, 4 Nov 2024 15:27:26 +0000
Subject: [PATCH 23/66] storage: Fix "Request body is too large" #TASK-6722

---
 .../variant/io/json/VariantJsonWriter.java | 6 +-
 .../variant/io/MaxWriteBlockOutputStream.java | 42 +++++++++
 .../variant/mr/VariantFileOutputFormat.java | 14 ++-
 .../io/MaxWriteBlockOutputStreamTest.java | 85 +++++++++++++++
 4 files changed, 138 insertions(+), 9 deletions(-)
 create mode 100644
opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/io/MaxWriteBlockOutputStream.java create mode 100644 opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/test/java/org/opencb/opencga/storage/hadoop/variant/io/MaxWriteBlockOutputStreamTest.java diff --git a/opencga-storage/opencga-storage-core/src/main/java/org/opencb/opencga/storage/core/variant/io/json/VariantJsonWriter.java b/opencga-storage/opencga-storage-core/src/main/java/org/opencb/opencga/storage/core/variant/io/json/VariantJsonWriter.java index 6930ef05b02..f8828b83c31 100644 --- a/opencga-storage/opencga-storage-core/src/main/java/org/opencb/opencga/storage/core/variant/io/json/VariantJsonWriter.java +++ b/opencga-storage/opencga-storage-core/src/main/java/org/opencb/opencga/storage/core/variant/io/json/VariantJsonWriter.java @@ -186,7 +186,11 @@ public boolean post() { fileGenerator.flush(); } } catch (IOException ex) { - close(); + try { + close(); + } catch (Exception ex1) { + ex.addSuppressed(ex1); + } throw new UncheckedIOException(ex); } return true; diff --git a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/io/MaxWriteBlockOutputStream.java b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/io/MaxWriteBlockOutputStream.java new file mode 100644 index 00000000000..55b4f82acce --- /dev/null +++ b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/io/MaxWriteBlockOutputStream.java @@ -0,0 +1,42 @@ +package org.opencb.opencga.storage.hadoop.variant.io; + +import java.io.FilterOutputStream; +import java.io.IOException; +import java.io.OutputStream; + +/** + * MaxWriteBlockOutputStream is a {@link FilterOutputStream} that writes blocks of a maximum size. + *
<p>
+ * If the block size is greater than the maximum block size, it will split the block into smaller blocks of the maximum size. + *
<p>
+ * This class is used to avoid writing large blocks into Azure Blob Storage. Azure Blob Storage has a limit of 4MB per block. + * See + * Request body too large. + */ +public class MaxWriteBlockOutputStream extends FilterOutputStream { + + private final int maxBlockSize; + + public MaxWriteBlockOutputStream(OutputStream out) { + this(out, 1024 * 1024 * 2); + } + + public MaxWriteBlockOutputStream(OutputStream out, int maxBlockSize) { + super(out); + this.maxBlockSize = maxBlockSize; + } + + @Override + public synchronized void write(byte[] b, int off, int len) throws IOException { + if (len > maxBlockSize) { + int start = 0; + while (start < len) { + int blockLength = Math.min(maxBlockSize, len - start); + out.write(b, off + start, blockLength); + start += blockLength; + } + } else { + out.write(b, off, len); + } + } +} diff --git a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/mr/VariantFileOutputFormat.java b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/mr/VariantFileOutputFormat.java index 0903d498b86..5cc41ce2eb1 100644 --- a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/mr/VariantFileOutputFormat.java +++ b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/mr/VariantFileOutputFormat.java @@ -34,11 +34,10 @@ import org.opencb.opencga.storage.core.metadata.VariantStorageMetadataManager; import org.opencb.opencga.storage.core.variant.io.VariantWriterFactory; import org.opencb.opencga.storage.core.variant.io.VariantWriterFactory.VariantOutputFormat; +import org.opencb.opencga.storage.hadoop.variant.io.MaxWriteBlockOutputStream; import org.opencb.opencga.storage.hadoop.variant.metadata.HBaseVariantStorageMetadataDBAdaptorFactory; -import java.io.DataOutputStream; -import java.io.IOException; -import java.io.OutputStream; +import java.io.*; /** @@ -67,13 +66,12 @@ public RecordWriter getRecordWriter(TaskAttemptContext jo } Path file = this.getDefaultWorkFile(job, extension); FileSystem fs = file.getFileSystem(conf); - FSDataOutputStream fileOut = fs.create(file, false); + OutputStream out = fs.create(file, false); if (isCompressed) { - DataOutputStream out = new DataOutputStream(codec.createOutputStream(fileOut)); - return new VariantRecordWriter(configureWriter(job, out), out); - } else { - return new VariantRecordWriter(configureWriter(job, fileOut), fileOut); + out = new DataOutputStream(codec.createOutputStream(out)); } + out = new MaxWriteBlockOutputStream(out); + return new VariantRecordWriter(configureWriter(job, out), out); } private DataWriter configureWriter(final TaskAttemptContext job, OutputStream fileOut) throws IOException { diff --git a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/test/java/org/opencb/opencga/storage/hadoop/variant/io/MaxWriteBlockOutputStreamTest.java b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/test/java/org/opencb/opencga/storage/hadoop/variant/io/MaxWriteBlockOutputStreamTest.java new file mode 100644 index 00000000000..21712f0d1f5 --- /dev/null +++ b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/test/java/org/opencb/opencga/storage/hadoop/variant/io/MaxWriteBlockOutputStreamTest.java @@ -0,0 +1,85 @@ +package org.opencb.opencga.storage.hadoop.variant.io; + +import 
org.junit.Test; +import org.junit.experimental.categories.Category; +import org.mockito.Mockito; +import org.opencb.opencga.core.testclassification.duration.ShortTests; + +import java.io.*; +import java.util.Random; + +import static org.junit.Assert.assertArrayEquals; +import static org.junit.Assert.assertThrows; + +@Category(ShortTests.class) +public class MaxWriteBlockOutputStreamTest { + + @Test + public void shouldWriteAndReadDataCorrectly() throws IOException { + ByteArrayOutputStream byteArrayOutputStream = new ByteArrayOutputStream(); + MaxWriteBlockOutputStream outputStream = new MaxWriteBlockOutputStream(byteArrayOutputStream); + + byte[] data = "test data".getBytes(); + outputStream.write(data, 0, data.length); + outputStream.flush(); + + ByteArrayInputStream byteArrayInputStream = new ByteArrayInputStream(byteArrayOutputStream.toByteArray()); + DataInputStream dataInputStream = new DataInputStream(byteArrayInputStream); + byte[] readData = new byte[data.length]; + dataInputStream.readFully(readData); + + assertArrayEquals(data, readData); + } + + @Test + public void shouldHandleEmptyData() throws IOException { + ByteArrayOutputStream byteArrayOutputStream = new ByteArrayOutputStream(); + MaxWriteBlockOutputStream outputStream = new MaxWriteBlockOutputStream(byteArrayOutputStream); + + byte[] data = new byte[0]; + outputStream.write(data, 0, data.length); + outputStream.flush(); + + ByteArrayInputStream byteArrayInputStream = new ByteArrayInputStream(byteArrayOutputStream.toByteArray()); + DataInputStream dataInputStream = new DataInputStream(byteArrayInputStream); + byte[] readData = new byte[data.length]; + dataInputStream.readFully(readData); + + assertArrayEquals(data, readData); + } + + @Test + public void shouldHandleLargeData() throws IOException { + ByteArrayOutputStream byteArrayOutputStream = new ByteArrayOutputStream(); + byteArrayOutputStream = Mockito.spy(byteArrayOutputStream); + Mockito.verify(byteArrayOutputStream, Mockito.never()).write(Mockito.any(byte[].class), Mockito.anyInt(), Mockito.anyInt()); + MaxWriteBlockOutputStream outputStream = new MaxWriteBlockOutputStream(byteArrayOutputStream, 1024); + + byte[] data = new byte[1024 * 1024]; // 1 MB of data + new Random().nextBytes(data); + outputStream.write(data, 0, data.length); + outputStream.flush(); + + // Check that the write method was called multiple times + Mockito.verify(byteArrayOutputStream, Mockito.times(1024)).write(Mockito.any(byte[].class), Mockito.anyInt(), Mockito.anyInt()); + Mockito.verify(byteArrayOutputStream, Mockito.never()).write(Mockito.any(byte[].class)); + + ByteArrayInputStream byteArrayInputStream = new ByteArrayInputStream(byteArrayOutputStream.toByteArray()); + DataInputStream dataInputStream = new DataInputStream(byteArrayInputStream); + byte[] readData = new byte[data.length]; + dataInputStream.readFully(readData); + + assertArrayEquals(data, readData); + } + + @Test + public void shouldThrowExceptionForNullData() { + ByteArrayOutputStream byteArrayOutputStream = new ByteArrayOutputStream(); + MaxWriteBlockOutputStream outputStream = new MaxWriteBlockOutputStream(byteArrayOutputStream); + + assertThrows(NullPointerException.class, () -> { + outputStream.write(null, 0, 0); + }); + } + +} \ No newline at end of file From b528c033c962360c2a6f1787e695cba6e1ec8aca Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jacobo=20Coll=20Morag=C3=B3n?= Date: Mon, 4 Nov 2024 15:41:29 +0000 Subject: [PATCH 24/66] analysis: Do not try to close twice the same ERM. 
#TASK-6722 --- .../opencga/analysis/tools/OpenCgaTool.java | 36 +++++++++++-------- .../variant/mr/VariantFileOutputFormat.java | 5 +-- 2 files changed, 24 insertions(+), 17 deletions(-) diff --git a/opencga-analysis/src/main/java/org/opencb/opencga/analysis/tools/OpenCgaTool.java b/opencga-analysis/src/main/java/org/opencb/opencga/analysis/tools/OpenCgaTool.java index 6024761d596..b5e01687b5f 100644 --- a/opencga-analysis/src/main/java/org/opencb/opencga/analysis/tools/OpenCgaTool.java +++ b/opencga-analysis/src/main/java/org/opencb/opencga/analysis/tools/OpenCgaTool.java @@ -192,17 +192,11 @@ public final ExecutionResult start() throws ToolException { if (!erm.isClosed()) { String message = "Unexpected system shutdown. Job killed by the system."; privateLogger.error(message); + if (exception == null) { + exception = new RuntimeException(message); + } try { - if (scratchDir != null) { - deleteScratchDirectory(); - } - if (exception == null) { - exception = new RuntimeException(message); - } - logException(exception); - ExecutionResult result = erm.close(exception); - privateLogger.info("------- Tool '" + getId() + "' executed in " - + TimeUtils.durationToString(result.getEnd().getTime() - result.getStart().getTime()) + " -------"); + close(exception); } catch (ToolException e) { privateLogger.error("Error closing ExecutionResult", e); } @@ -271,13 +265,25 @@ public final ExecutionResult start() throws ToolException { } throw e; } finally { + // If the shutdown hook has been executed, the ExecutionResultManager is already closed + if (!erm.isClosed()) { + result = close(exception); + } else { + result = erm.read(); + } + } + return result; + } + + private ExecutionResult close(Throwable exception) throws ToolException { + if (scratchDir != null) { deleteScratchDirectory(); - stopMemoryMonitor(); - result = erm.close(exception); - logException(exception); - privateLogger.info("------- Tool '" + getId() + "' executed in " - + TimeUtils.durationToString(result.getEnd().getTime() - result.getStart().getTime()) + " -------"); } + logException(exception); + stopMemoryMonitor(); + ExecutionResult result = erm.close(exception); + privateLogger.info("------- Tool '" + getId() + "' executed in " + + TimeUtils.durationToString(result.getEnd().getTime() - result.getStart().getTime()) + " -------"); return result; } diff --git a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/mr/VariantFileOutputFormat.java b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/mr/VariantFileOutputFormat.java index 5cc41ce2eb1..3a4a2f7293b 100644 --- a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/mr/VariantFileOutputFormat.java +++ b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/mr/VariantFileOutputFormat.java @@ -17,7 +17,6 @@ package org.opencb.opencga.storage.hadoop.variant.mr; import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.fs.FSDataOutputStream; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.hadoop.io.NullWritable; @@ -37,7 +36,9 @@ import org.opencb.opencga.storage.hadoop.variant.io.MaxWriteBlockOutputStream; import org.opencb.opencga.storage.hadoop.variant.metadata.HBaseVariantStorageMetadataDBAdaptorFactory; -import java.io.*; +import 
java.io.DataOutputStream; +import java.io.IOException; +import java.io.OutputStream; /** From 96e56795afdfaf7597d1648c1cd785da0a0a78d6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jacobo=20Coll=20Morag=C3=B3n?= Date: Thu, 7 Nov 2024 14:30:41 +0000 Subject: [PATCH 25/66] storage: Do not use flush on outputstream. HADOOP-16548 #TASK-6722 --- .../variant/io/json/VariantJsonWriter.java | 8 ++--- .../variant/io/CountingOutputStream.java | 36 +++++++++++++++++++ .../variant/mr/VariantFileOutputFormat.java | 10 +++--- 3 files changed, 46 insertions(+), 8 deletions(-) create mode 100644 opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/io/CountingOutputStream.java diff --git a/opencga-storage/opencga-storage-core/src/main/java/org/opencb/opencga/storage/core/variant/io/json/VariantJsonWriter.java b/opencga-storage/opencga-storage-core/src/main/java/org/opencb/opencga/storage/core/variant/io/json/VariantJsonWriter.java index f8828b83c31..2a8437099bf 100644 --- a/opencga-storage/opencga-storage-core/src/main/java/org/opencb/opencga/storage/core/variant/io/json/VariantJsonWriter.java +++ b/opencga-storage/opencga-storage-core/src/main/java/org/opencb/opencga/storage/core/variant/io/json/VariantJsonWriter.java @@ -177,13 +177,13 @@ public boolean write(Variant variant) { @Override public boolean post() { try { - variantsStream.flush(); - variantsGenerator.flush(); +// variantsStream.flush(); +// variantsGenerator.flush(); if (fileGenerator != null) { fileGenerator.writeObject(fileMetadata); - fileStream.flush(); - fileGenerator.flush(); +// fileStream.flush(); +// fileGenerator.flush(); } } catch (IOException ex) { try { diff --git a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/io/CountingOutputStream.java b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/io/CountingOutputStream.java new file mode 100644 index 00000000000..93f3dcd9bf8 --- /dev/null +++ b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/io/CountingOutputStream.java @@ -0,0 +1,36 @@ +package org.opencb.opencga.storage.hadoop.variant.io; + +import java.io.FilterOutputStream; +import java.io.IOException; +import java.io.OutputStream; + +public class CountingOutputStream extends FilterOutputStream { + + private long count = 0; + + public CountingOutputStream(OutputStream os) { + super(os); + } + + @Override + public void write(int b) throws IOException { + out.write(b); + count++; + } + + @Override + public void write(byte[] b) throws IOException { + out.write(b); + count += b.length; + } + + @Override + public void write(byte[] b, int off, int len) throws IOException { + out.write(b, off, len); + count += len; + } + + public long getByteCount() { + return count; + } +} diff --git a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/mr/VariantFileOutputFormat.java b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/mr/VariantFileOutputFormat.java index 3a4a2f7293b..16dd0fffa3f 100644 --- a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/mr/VariantFileOutputFormat.java +++ 
b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/mr/VariantFileOutputFormat.java @@ -33,13 +33,15 @@ import org.opencb.opencga.storage.core.metadata.VariantStorageMetadataManager; import org.opencb.opencga.storage.core.variant.io.VariantWriterFactory; import org.opencb.opencga.storage.core.variant.io.VariantWriterFactory.VariantOutputFormat; -import org.opencb.opencga.storage.hadoop.variant.io.MaxWriteBlockOutputStream; +import org.opencb.opencga.storage.hadoop.variant.io.CountingOutputStream; import org.opencb.opencga.storage.hadoop.variant.metadata.HBaseVariantStorageMetadataDBAdaptorFactory; import java.io.DataOutputStream; import java.io.IOException; import java.io.OutputStream; +import static org.opencb.opencga.storage.hadoop.variant.mr.VariantsTableMapReduceHelper.COUNTER_GROUP_NAME; + /** * Writes variants into any format supported by the {@link VariantWriterFactory}. @@ -71,7 +73,6 @@ public RecordWriter getRecordWriter(TaskAttemptContext jo if (isCompressed) { out = new DataOutputStream(codec.createOutputStream(out)); } - out = new MaxWriteBlockOutputStream(out); return new VariantRecordWriter(configureWriter(job, out), out); } @@ -100,11 +101,11 @@ private DataWriter configureWriter(final TaskAttemptContext job, Output protected static class VariantRecordWriter extends RecordWriter { private final DataWriter writer; - private final OutputStream outputStream; + private final CountingOutputStream outputStream; public VariantRecordWriter(DataWriter writer, OutputStream outputStream) { this.writer = writer; - this.outputStream = outputStream; + this.outputStream = new CountingOutputStream(outputStream); } @Override @@ -117,6 +118,7 @@ public void close(TaskAttemptContext taskAttemptContext) throws IOException, Int writer.post(); writer.close(); outputStream.close(); + taskAttemptContext.getCounter(COUNTER_GROUP_NAME, "bytes_written").increment(outputStream.getByteCount()); } } From bcd8185f1aab1121150524c3f7425cd023a020a9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jacobo=20Coll=20Morag=C3=B3n?= Date: Thu, 7 Nov 2024 14:48:01 +0000 Subject: [PATCH 26/66] storage: Add VariantExporterDirectMultipleOutputsMapper to ensure sorted export without reduce step. 
#TASK-6722 --- .../hadoop/utils/AbstractHBaseDriver.java | 57 ++++++++++-- .../variant/executors/SshMRExecutor.java | 3 +- .../hadoop/variant/io/VariantDriver.java | 37 ++------ .../variant/io/VariantExporterDriver.java | 87 +++++++++++++++++-- .../variant/mr/StreamVariantDriver.java | 21 ++--- .../variant/mr/StreamVariantReducer.java | 6 +- .../utils/HBaseVariantTableNameGenerator.java | 4 + .../variant/HadoopVariantStorageTest.java | 1 + 8 files changed, 155 insertions(+), 61 deletions(-) diff --git a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/utils/AbstractHBaseDriver.java b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/utils/AbstractHBaseDriver.java index 99a119264c3..7a069d6efa3 100644 --- a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/utils/AbstractHBaseDriver.java +++ b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/utils/AbstractHBaseDriver.java @@ -31,11 +31,13 @@ import org.opencb.opencga.core.common.TimeUtils; import org.opencb.opencga.storage.core.exceptions.StorageEngineException; import org.opencb.opencga.storage.hadoop.io.HDFSIOConnector; +import org.opencb.opencga.storage.hadoop.variant.executors.SshMRExecutor; import org.opencb.opencga.storage.hadoop.variant.mr.VariantMapReduceUtil; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import java.io.*; +import java.net.URI; import java.nio.charset.Charset; import java.nio.file.Files; import java.nio.file.Paths; @@ -220,6 +222,10 @@ public final int run(String[] args) throws Exception { } private void reportRunningJobs() { + if (getConf().getBoolean("storage.hadoop.mr.skipReportRunningJobs", false)) { + LOGGER.info("Skip report running jobs"); + return; + } // Get the number of pending or running jobs in yarn try (YarnClient yarnClient = YarnClient.createYarnClient()) { yarnClient.init(getConf()); @@ -362,10 +368,20 @@ protected Path getTempOutdir(String prefix, String suffix, boolean ensureHdfs) t } } } - LOGGER.info("Temporary directory: " + tmpDir.toUri()); + LOGGER.info("Temporary directory: " + toUri(tmpDir)); return new Path(tmpDir, fileName); } + private URI toUri(Path path) throws IOException { + URI tmpUri = path.toUri(); + if (tmpUri.getScheme() == null) { + // If the scheme is null, add the default scheme + FileSystem fileSystem = path.getFileSystem(getConf()); + tmpUri = fileSystem.getUri().resolve(tmpUri.getPath()); + } + return tmpUri; + } + protected Path getLocalOutput(Path outdir) throws IOException { return getLocalOutput(outdir, () -> null); } @@ -408,13 +424,23 @@ public class MapReduceOutputFile { private final Supplier nameGenerator; private final String tempFilePrefix; + private final Map extraFiles = new HashMap<>(); + private String namedOutput; protected Path localOutput; protected Path outdir; + public MapReduceOutputFile(String tempFilePrefix) throws IOException { + this.nameGenerator = () -> null; + this.tempFilePrefix = tempFilePrefix; + getOutputPath(); + namedOutput = null; + } + public MapReduceOutputFile(Supplier nameGenerator, String tempFilePrefix) throws IOException { this.nameGenerator = nameGenerator; this.tempFilePrefix = tempFilePrefix; getOutputPath(); + namedOutput = null; } protected void getOutputPath() throws IOException { @@ -428,10 +454,10 @@ protected void getOutputPath() throws IOException { 
outdir.getFileSystem(getConf()).deleteOnExit(outdir); } if (localOutput != null) { - LOGGER.info(" * Outdir file: " + localOutput.toUri()); - LOGGER.info(" * Temporary outdir file: " + outdir.toUri()); + LOGGER.info(" * Outdir file: " + toUri(localOutput)); + LOGGER.info(" * Temporary outdir file: " + toUri(outdir)); } else { - LOGGER.info(" * Outdir file: " + outdir.toUri()); + LOGGER.info(" * Outdir file: " + toUri(outdir)); } } } @@ -439,7 +465,7 @@ protected void getOutputPath() throws IOException { public void postExecute(boolean succeed) throws IOException { if (succeed) { if (localOutput != null) { - concatMrOutputToLocal(outdir, localOutput); + getConcatMrOutputToLocal(); } } if (localOutput != null) { @@ -447,6 +473,27 @@ public void postExecute(boolean succeed) throws IOException { } } + public MapReduceOutputFile setNamedOutput(String partFilePrefix) { + this.namedOutput = partFilePrefix; + return this; + } + + public void addExtraNamedOutput(String namedOutput, String localOutputPrefix) { + extraFiles.put(namedOutput, localOutputPrefix); + } + + protected void getConcatMrOutputToLocal() throws IOException { + concatMrOutputToLocal(outdir, localOutput, true, namedOutput); + + for (Map.Entry entry : extraFiles.entrySet()) { + String suffix = entry.getValue(); + String partFilePrefix = entry.getKey(); + Path extraOutput = localOutput.suffix(suffix); + concatMrOutputToLocal(outdir, extraOutput, true, partFilePrefix); + printKeyValue(SshMRExecutor.EXTRA_OUTPUT_PREFIX + partFilePrefix.toUpperCase(), extraOutput); + } + } + public Path getLocalOutput() { return localOutput; } diff --git a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/executors/SshMRExecutor.java b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/executors/SshMRExecutor.java index 612f3183a98..faea9185887 100644 --- a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/executors/SshMRExecutor.java +++ b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/executors/SshMRExecutor.java @@ -35,6 +35,7 @@ public class SshMRExecutor extends MRExecutor { // env-var expected by "sshpass -e" private static final String SSHPASS_ENV = "SSHPASS"; public static final String PID = "PID"; + public static final String EXTRA_OUTPUT_PREFIX = "EXTRA_OUTPUT_"; private static Logger logger = LoggerFactory.getLogger(SshMRExecutor.class); @Override @@ -107,7 +108,7 @@ public Result run(String executable, String[] args) throws StorageEngineExceptio if (exitValue == 0) { copyOutputFiles(args, env); for (String key : result.keySet()) { - if (key.startsWith("EXTRA_OUTPUT_")) { + if (key.startsWith(EXTRA_OUTPUT_PREFIX)) { copyOutputFiles(result.getString(key), env); } } diff --git a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/io/VariantDriver.java b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/io/VariantDriver.java index 223c0b91559..7a2324e17f2 100644 --- a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/io/VariantDriver.java +++ 
b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/io/VariantDriver.java @@ -1,7 +1,6 @@ package org.opencb.opencga.storage.hadoop.variant.io; import org.apache.commons.lang3.StringUtils; -import org.apache.hadoop.fs.Path; import org.apache.hadoop.hbase.client.Scan; import org.apache.hadoop.mapred.JobContext; import org.apache.hadoop.mapreduce.Job; @@ -50,8 +49,7 @@ public abstract class VariantDriver extends AbstractVariantsTableDriver { public static final String OUTPUT_PARAM = "output"; public static final String CONCAT_OUTPUT_PARAM = "concat-output"; - protected Path outdir; - protected Path localOutput; + protected MapReduceOutputFile output; private final Query query = new Query(); private final QueryOptions options = new QueryOptions(); private static Logger logger = LoggerFactory.getLogger(VariantDriver.class); @@ -61,25 +59,9 @@ public abstract class VariantDriver extends AbstractVariantsTableDriver { protected void parseAndValidateParameters() throws IOException { setStudyId(-1); super.parseAndValidateParameters(); - String outdirStr = getParam(OUTPUT_PARAM); - if (StringUtils.isEmpty(outdirStr)) { - throw new IllegalArgumentException("Missing argument " + OUTPUT_PARAM); - } - useReduceStep = Boolean.valueOf(getParam(CONCAT_OUTPUT_PARAM)); - outdir = new Path(outdirStr); - if (isLocal(outdir)) { - localOutput = getLocalOutput(outdir); - outdir = getTempOutdir("opencga_export", localOutput.getName()); - outdir.getFileSystem(getConf()).deleteOnExit(outdir); - } - if (localOutput != null) { - useReduceStep = true; - logger.info(" * Outdir file: " + localOutput.toUri()); - logger.info(" * Temporary outdir file: " + outdir.toUri()); - } else { - logger.info(" * Outdir file: " + outdir.toUri()); - } +// useReduceStep = Boolean.valueOf(getParam(CONCAT_OUTPUT_PARAM)); + output = new MapReduceOutputFile(getTableNameGenerator().getDbName() + "_" + getClass().getSimpleName()); getQueryFromConfig(query, getConf()); getQueryOptionsFromConfig(options, getConf()); @@ -156,7 +138,7 @@ protected final Job setupJob(Job job, String archiveTable, String variantTable) setNoneTimestamp(job); - FileOutputFormat.setOutputPath(job, outdir); // set Path + FileOutputFormat.setOutputPath(job, output.getOutdir()); // set Path VariantMapReduceUtil.configureVariantConverter(job.getConfiguration(), false, true, true, query.getString(VariantQueryParam.UNKNOWN_GENOTYPE.key(), "./.")); @@ -193,16 +175,7 @@ protected void setupReducer(Job job, String variantTable) throws IOException { @Override protected void postExecution(boolean succeed) throws IOException, StorageEngineException { super.postExecution(succeed); - if (localOutput != null) { - if (succeed) { - copyMrOutputToLocal(); - } - deleteTemporaryFile(outdir); - } - } - - protected void copyMrOutputToLocal() throws IOException { - concatMrOutputToLocal(outdir, localOutput, true, null); + output.postExecute(succeed); } } diff --git a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/io/VariantExporterDriver.java b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/io/VariantExporterDriver.java index d2489fac1c7..f5c38ec7c7c 100644 --- a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/io/VariantExporterDriver.java +++ 
b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/io/VariantExporterDriver.java @@ -11,11 +11,11 @@ import org.apache.hadoop.io.compress.GzipCodec; import org.apache.hadoop.io.compress.SnappyCodec; import org.apache.hadoop.mapred.JobContext; -import org.apache.hadoop.mapreduce.Job; -import org.apache.hadoop.mapreduce.Partitioner; -import org.apache.hadoop.mapreduce.Reducer; -import org.apache.hadoop.mapreduce.TaskAttemptContext; +import org.apache.hadoop.mapreduce.*; import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; +import org.apache.hadoop.mapreduce.lib.output.LazyOutputFormat; +import org.apache.hadoop.mapreduce.lib.output.MultipleOutputs; +import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat; import org.apache.parquet.Log; import org.apache.parquet.avro.AvroParquetOutputFormat; import org.apache.parquet.hadoop.ParquetOutputFormat; @@ -43,7 +43,7 @@ public class VariantExporterDriver extends VariantDriver { private VariantWriterFactory.VariantOutputFormat outputFormat; private Class mapperClass; private Class reducerClass; - private Class outputFormatClass; + private Class outputFormatClass; private Class partitioner; @Override @@ -69,7 +69,7 @@ protected Class getPartitioner() { } @Override - protected Class getOutputFormatClass() { + protected Class getOutputFormatClass() { return outputFormatClass; } @@ -126,17 +126,25 @@ protected void setupJob(Job job) throws IOException { mapperClass = AvroVariantExporterMapper.class; reducerClass = VariantExporterReducer.class; partitioner = VariantLocusKeyPartitioner.class; + outputFormatClass = VariantFileOutputFormat.class; } else { AvroJob.setOutputKeySchema(job, VariantAvro.getClassSchema()); - mapperClass = VariantExporterDirectMapper.class; + mapperClass = VariantExporterDirectMultipleOutputsMapper.class; +// mapperClass = VariantExporterDirectMapper.class; + reducerClass = null; + +// MultipleOutputs.setCountersEnabled(job, true); + MultipleOutputs.addNamedOutput(job, VariantExporterDirectMultipleOutputsMapper.NAMED_OUTPUT, + VariantFileOutputFormat.class, Variant.class, NullWritable.class); + LazyOutputFormat.setOutputFormatClass(job, TextOutputFormat.class); + outputFormatClass = LazyOutputFormat.class; } if (outputFormat.isGzip()) { FileOutputFormat.setOutputCompressorClass(job, GzipCodec.class); // compression } else if (outputFormat.isSnappy()) { FileOutputFormat.setOutputCompressorClass(job, SnappyCodec.class); // compression } - outputFormatClass = VariantFileOutputFormat.class; job.getConfiguration().set(VariantFileOutputFormat.VARIANT_OUTPUT_FORMAT, outputFormat.name()); job.setOutputKeyClass(Variant.class); break; @@ -169,6 +177,69 @@ protected void map(Object key, Variant value, Context context) throws IOExceptio } } + /** + * Mapper to convert to Variant. + * The output of this mapper should be connected directly to the {@link VariantWriterFactory.VariantOutputFormat} + * This mapper can not work with a reduce step. + * The output is written to multiple outputs, ensuring that generated files are sorted by chromosome and position. 
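+     * For example (illustrative values), a variant at 1:12345 written through the "export" named output gets the
+     * file prefix "export.01.0000012345." (see buildOutputKeyPrefix below), so part files that sort
+     * lexicographically are also sorted by locus.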
+     */
+    public static class VariantExporterDirectMultipleOutputsMapper extends VariantMapper {
+
+        public static final String NAMED_OUTPUT = "export";
+        private String baseOutputPath;
+        private String chromosome;
+
+        public static String buildOutputKeyPrefix(String chromosome, Integer start) {
+            // If it's a single digit chromosome, add a 0 at the beginning
+            // 1 -> 01
+            // 3 -> 03
+            // 22 -> 22
+            // If the first character is a digit, and the second is not, add a 0 at the beginning
+            // MT -> MT
+            // 1_KI270712v1_random -> 01_KI270712v1_random
+            if (VariantLocusKey.isSingleDigitChromosome(chromosome)) {
+                chromosome = "0" + chromosome;
+            }
+
+            return String.format("%s.%s.%010d.", NAMED_OUTPUT, chromosome, start);
+        }
+
+        private MultipleOutputs mos;
+
+        @Override
+        protected void setup(Context context) throws IOException, InterruptedException {
+            super.setup(context);
+            mos = new MultipleOutputs<>(context);
+            context.getCounter(COUNTER_GROUP_NAME, "variants").increment(0);
+        }
+
+        @Override
+        protected void map(Object key, Variant value, Context context) throws IOException, InterruptedException {
+            context.getCounter(COUNTER_GROUP_NAME, "variants").increment(1);
+            if (baseOutputPath == null || !consecutiveChromosomes(chromosome, value.getChromosome())) {
+                baseOutputPath = buildOutputKeyPrefix(value.getChromosome(), value.getStart());
+                chromosome = value.getChromosome();
+            }
+            mos.write(NAMED_OUTPUT, value, NullWritable.get(), baseOutputPath);
+        }
+
+        private static boolean consecutiveChromosomes(String prevChromosome, String newChromosome) {
+            if (newChromosome.equals(prevChromosome)) {
+                return true;
+            }
+            if (VariantLocusKey.isSingleDigitChromosome(prevChromosome)) {
+                return VariantLocusKey.isSingleDigitChromosome(newChromosome);
+            } else {
+                return !VariantLocusKey.isSingleDigitChromosome(newChromosome);
+            }
+        }
+
+        @Override
+        protected void cleanup(Mapper.Context context) throws IOException, InterruptedException {
+            super.cleanup(context);
+            mos.close();
+        }
+    }
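A quick illustration of the grouping rule above (hypothetical calls; consecutiveChromosomes is private, so this is a sketch rather than client code):

    consecutiveChromosomes("7", "8");   // true: both single-digit, padded into the same lexicographic group
    consecutiveChromosomes("9", "10");  // false: "9" pads to "09" but "10" does not, so a new base output path is started

+
 /**
  * Mapper to convert to VariantAvro.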
* The output of this mapper should be connected directly to the {@link AvroKeyOutputFormat} diff --git a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/mr/StreamVariantDriver.java b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/mr/StreamVariantDriver.java index b6cedb5d484..c9a42c9be74 100644 --- a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/mr/StreamVariantDriver.java +++ b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/mr/StreamVariantDriver.java @@ -1,7 +1,6 @@ package org.opencb.opencga.storage.hadoop.variant.mr; import org.apache.commons.lang3.StringUtils; -import org.apache.hadoop.fs.Path; import org.apache.hadoop.io.Text; import org.apache.hadoop.io.compress.CompressionCodec; import org.apache.hadoop.io.compress.DeflateCodec; @@ -34,6 +33,8 @@ public class StreamVariantDriver extends VariantDriver { public static final String MAX_BYTES_PER_MAP_PARAM = "maxBytesPerMap"; public static final String ENVIRONMENT_VARIABLES = "envVars"; public static final String STDERR_TXT_GZ = ".stderr.txt.gz"; + public static final String STDOUT_NAMED_OUTPUT = "stdout"; + public static final String STDERR_NAMED_OUTPUT = "stderr"; private VariantWriterFactory.VariantOutputFormat format; private int maxBytesPerMap; @@ -101,6 +102,9 @@ protected void parseAndValidateParameters() throws IOException { if (StringUtils.isEmpty(outdirStr)) { throw new IllegalArgumentException("Missing argument " + OUTPUT_PARAM); } + + output.setNamedOutput(STDOUT_NAMED_OUTPUT); + output.addExtraNamedOutput(STDERR_NAMED_OUTPUT, STDERR_TXT_GZ); } @Override @@ -144,10 +148,12 @@ protected void setupJob(Job job) throws IOException { StreamVariantMapper.setMaxInputBytesPerProcess(job, maxBytesPerMap); StreamVariantMapper.setEnvironment(job, envVars); + // Current implementation only supports using the reduce step + useReduceStep = true; reducerClass = StreamVariantReducer.class; - MultipleOutputs.addNamedOutput(job, "stdout", ValueOnlyTextOutputFormat.class, keyClass, valueClass); - MultipleOutputs.addNamedOutput(job, "stderr", ValueOnlyTextOutputFormat.class, keyClass, valueClass); + MultipleOutputs.addNamedOutput(job, STDOUT_NAMED_OUTPUT, ValueOnlyTextOutputFormat.class, keyClass, valueClass); + MultipleOutputs.addNamedOutput(job, STDERR_NAMED_OUTPUT, ValueOnlyTextOutputFormat.class, keyClass, valueClass); LazyOutputFormat.setOutputFormatClass(job, TextOutputFormat.class); outputFormatClass = LazyOutputFormat.class; @@ -172,15 +178,6 @@ protected String getJobOperationName() { return "stream-variants"; } - - @Override - protected void copyMrOutputToLocal() throws IOException { - concatMrOutputToLocal(outdir, localOutput, true, "stdout"); - Path stderrOutput = localOutput.suffix(STDERR_TXT_GZ); - concatMrOutputToLocal(outdir, stderrOutput, true, "stderr"); - printKeyValue("EXTRA_OUTPUT_STDERR", stderrOutput); - } - @SuppressWarnings("unchecked") public static void main(String[] args) { main(args, (Class) MethodHandles.lookup().lookupClass()); diff --git a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/mr/StreamVariantReducer.java 
b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/mr/StreamVariantReducer.java index c10bbcb2591..695ed56e832 100644 --- a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/mr/StreamVariantReducer.java +++ b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/mr/StreamVariantReducer.java @@ -41,7 +41,7 @@ protected void reduce(VariantLocusKey key, Iterable values, // skip header context.getCounter(VariantsTableMapReduceHelper.COUNTER_GROUP_NAME, "header_records_skip").increment(1); } else { - mos.write("stdout", key, value); + mos.write(StreamVariantDriver.STDOUT_NAMED_OUTPUT, key, value); context.getCounter(VariantsTableMapReduceHelper.COUNTER_GROUP_NAME, "header_records").increment(1); } } else { @@ -53,14 +53,14 @@ protected void reduce(VariantLocusKey key, Iterable values, // No more header, assume all header is written headerWritten = true; } - mos.write("stdout", key, value); + mos.write(StreamVariantDriver.STDOUT_NAMED_OUTPUT, key, value); context.getCounter(VariantsTableMapReduceHelper.COUNTER_GROUP_NAME, "body_records").increment(1); } context.getCounter(VariantsTableMapReduceHelper.COUNTER_GROUP_NAME, "stdout_records").increment(1); context.getCounter(VariantsTableMapReduceHelper.COUNTER_GROUP_NAME, "stdout_records_bytes") .increment(value.getLength()); } else if (key.getOther().startsWith(STDERR_KEY)) { - mos.write("stderr", key, value); + mos.write(StreamVariantDriver.STDERR_NAMED_OUTPUT, key, value); context.getCounter(VariantsTableMapReduceHelper.COUNTER_GROUP_NAME, "stderr_records").increment(1); context.getCounter(VariantsTableMapReduceHelper.COUNTER_GROUP_NAME, "stderr_records_bytes") .increment(value.getLength()); diff --git a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/utils/HBaseVariantTableNameGenerator.java b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/utils/HBaseVariantTableNameGenerator.java index 39d8e3868fc..4777c7a8767 100644 --- a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/utils/HBaseVariantTableNameGenerator.java +++ b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/utils/HBaseVariantTableNameGenerator.java @@ -58,6 +58,10 @@ public HBaseVariantTableNameGenerator(String namespace, String dbName) { pendingSecondaryIndexPruneTableName = getPendingSecondaryIndexPruneTableName(namespace, this.dbName); } + public String getDbName() { + return dbName; + } + public String getVariantTableName() { return variantTableName; } diff --git a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/test/java/org/opencb/opencga/storage/hadoop/variant/HadoopVariantStorageTest.java b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/test/java/org/opencb/opencga/storage/hadoop/variant/HadoopVariantStorageTest.java index cf4d7984ada..4ddc5d03a0a 100644 --- a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/test/java/org/opencb/opencga/storage/hadoop/variant/HadoopVariantStorageTest.java +++ 
b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/test/java/org/opencb/opencga/storage/hadoop/variant/HadoopVariantStorageTest.java @@ -420,6 +420,7 @@ static StorageConfiguration updateStorageConfiguration(StorageConfiguration stor TestMRExecutor.setStaticConfiguration(conf); options.put(HadoopVariantStorageOptions.MR_ADD_DEPENDENCY_JARS.key(), false); + options.put("storage.hadoop.mr.skipReportRunningJobs", true); EnumSet supportedAlgorithms = EnumSet.of(Compression.Algorithm.NONE, HBaseTestingUtility.getSupportedCompressionAlgorithms()); options.put(HadoopVariantStorageOptions.ARCHIVE_TABLE_COMPRESSION.key(), supportedAlgorithms.contains(Compression.Algorithm.GZ) From c4c3d3b4ac0d25e068ae7ee0d9e81a8c4b06c167 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jacobo=20Coll=20Morag=C3=B3n?= Date: Thu, 7 Nov 2024 15:40:15 +0000 Subject: [PATCH 27/66] storage: Do not use reduce step on variant-walker. #TASK-6722 --- .../variant/io/HadoopVariantExporter.java | 5 +- .../variant/io/VariantExporterDriver.java | 6 +- .../variant/mr/StreamVariantDriver.java | 10 +++- .../variant/mr/StreamVariantMapper.java | 56 ++++++++++++++++--- 4 files changed, 62 insertions(+), 15 deletions(-) diff --git a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/io/HadoopVariantExporter.java b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/io/HadoopVariantExporter.java index 2bc76ab5e99..b16ef093616 100644 --- a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/io/HadoopVariantExporter.java +++ b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/io/HadoopVariantExporter.java @@ -178,15 +178,16 @@ public List export(@Nullable URI outputFileUri, VariantWriterFactory.Varian logger.info("Count {}/{} variants from query {}", count, totalCount, getSearchEngineQuery(query)); if (count < variantsThreshold || matchRate < matchRatioThreshold) { logger.info("Query for approximately {} of {} variants, which is {}% of the total." + + " Current variants threshold is {}, and matchRatioThreshold is {}% ." + " Consider small query." + " Skip MapReduce", - count, totalCount, String.format("%.2f", matchRate * 100)); + count, totalCount, String.format("%.4f", matchRate * 100), variantsThreshold, matchRatioThreshold); smallQuery = true; } else { logger.info("Query for approximately {} of {} variants, which is {}% of the total." + " Current variants threshold is {}, and matchRatioThreshold is {}% ." 
+ " Not a small query", - count, totalCount, String.format("%.2f", matchRate * 100), variantsThreshold, matchRatioThreshold); + count, totalCount, String.format("%.3f", matchRate * 100), variantsThreshold, matchRatioThreshold); } } catch (VariantSearchException e) { logger.info("Unable to count variants from SearchEngine", e); diff --git a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/io/VariantExporterDriver.java b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/io/VariantExporterDriver.java index f5c38ec7c7c..4c7abd32863 100644 --- a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/io/VariantExporterDriver.java +++ b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/io/VariantExporterDriver.java @@ -189,7 +189,7 @@ public static class VariantExporterDirectMultipleOutputsMapper extends VariantMa private String baseOutputPath; private String chromosome; - public static String buildOutputKeyPrefix(String chromosome, Integer start) { + public static String buildOutputKeyPrefix(String namedOutput, String chromosome, Integer start) { // If it's a single digit chromosome, add a 0 at the beginning // 1 -> 01 // 3 -> 03 @@ -201,7 +201,7 @@ public static String buildOutputKeyPrefix(String chromosome, Integer start) { chromosome = "0" + chromosome; } - return String.format("%s.%s.%010d.", NAMED_OUTPUT, chromosome, start); + return String.format("%s.%s.%010d.", namedOutput, chromosome, start); } private MultipleOutputs mos; @@ -216,7 +216,7 @@ protected void setup(Context context) throws IOException, InterruptedException { protected void map(Object key, Variant value, Context context) throws IOException, InterruptedException { context.getCounter(COUNTER_GROUP_NAME, "variants").increment(1); if (baseOutputPath == null || !consecutiveChromosomes(chromosome, value.getChromosome())) { - baseOutputPath = buildOutputKeyPrefix(value.getChromosome(), value.getStart()); + baseOutputPath = buildOutputKeyPrefix(NAMED_OUTPUT, value.getChromosome(), value.getStart()); chromosome = value.getChromosome(); } mos.write(NAMED_OUTPUT, value, NullWritable.get(), baseOutputPath); diff --git a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/mr/StreamVariantDriver.java b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/mr/StreamVariantDriver.java index c9a42c9be74..0985b6c0f6c 100644 --- a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/mr/StreamVariantDriver.java +++ b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/mr/StreamVariantDriver.java @@ -148,9 +148,13 @@ protected void setupJob(Job job) throws IOException { StreamVariantMapper.setMaxInputBytesPerProcess(job, maxBytesPerMap); StreamVariantMapper.setEnvironment(job, envVars); - // Current implementation only supports using the reduce step - useReduceStep = true; - reducerClass = StreamVariantReducer.class; + if (useReduceStep) { + reducerClass = StreamVariantReducer.class; + StreamVariantMapper.setHasReduce(job, true); + } else { + reducerClass = null; + 
StreamVariantMapper.setHasReduce(job, false); + } MultipleOutputs.addNamedOutput(job, STDOUT_NAMED_OUTPUT, ValueOnlyTextOutputFormat.class, keyClass, valueClass); MultipleOutputs.addNamedOutput(job, STDERR_NAMED_OUTPUT, ValueOnlyTextOutputFormat.class, keyClass, valueClass); diff --git a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/mr/StreamVariantMapper.java b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/mr/StreamVariantMapper.java index f0a1de0f73e..4805e5fe6ba 100644 --- a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/mr/StreamVariantMapper.java +++ b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/mr/StreamVariantMapper.java @@ -10,6 +10,7 @@ import org.apache.hadoop.io.Text; import org.apache.hadoop.mapreduce.Job; import org.apache.hadoop.mapreduce.Mapper; +import org.apache.hadoop.mapreduce.lib.output.MultipleOutputs; import org.apache.hadoop.util.LineReader; import org.apache.hadoop.util.StopWatch; import org.opencb.biodata.models.variant.Variant; @@ -29,16 +30,20 @@ import java.util.*; import java.util.concurrent.TimeUnit; +import static org.opencb.opencga.storage.hadoop.variant.io.VariantExporterDriver.VariantExporterDirectMultipleOutputsMapper.buildOutputKeyPrefix; +import static org.opencb.opencga.storage.hadoop.variant.mr.StreamVariantDriver.STDERR_NAMED_OUTPUT; +import static org.opencb.opencga.storage.hadoop.variant.mr.StreamVariantDriver.STDOUT_NAMED_OUTPUT; import static org.opencb.opencga.storage.hadoop.variant.mr.VariantsTableMapReduceHelper.COUNTER_GROUP_NAME; public class StreamVariantMapper extends VariantMapper { private static final Log LOG = LogFactory.getLog(StreamVariantMapper.class); private static final int BUFFER_SIZE = 128 * 1024; - public static final String MAX_INPUT_BYTES_PER_PROCESS = "stream.maxInputBytesPerProcess"; + public static final String MAX_INPUT_BYTES_PER_PROCESS = "opencga.variant.stream.maxInputBytesPerProcess"; public static final String VARIANT_FORMAT = "opencga.variant.stream.format"; - public static final String COMMANDLINE_BASE64 = "opencga.variant.commandline_base64"; - public static final String ADDENVIRONMENT_PARAM = "opencga.variant.addenvironment"; + public static final String COMMANDLINE_BASE64 = "opencga.variant.stream.commandline_base64"; + public static final String ADDENVIRONMENT_PARAM = "opencga.variant.stream.addenvironment"; + public static final String HAS_REDUCE = "opencga.variant.stream.hasReduce"; private final boolean verboseStdout = false; private static final long REPORTER_OUT_DELAY = 10 * 1000L; @@ -54,6 +59,7 @@ public class StreamVariantMapper extends VariantMapper { private Query query; private QueryOptions options; private String firstVariant; + private boolean multipleOutputs; private int processCount = 0; @@ -71,6 +77,7 @@ public class StreamVariantMapper extends VariantMapper { private int processedBytes = 0; private long numRecordsRead = 0; private long numRecordsWritten = 0; + private MultipleOutputs mos; // auto-incremental number for each produced record. // These are used with the VariantLocusKey to ensure a sorted output. 
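+    // Both counters are reset to zero each time a new external process is started (see startProcess).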
private int stdoutKeyNum; @@ -94,6 +101,10 @@ public static void setMaxInputBytesPerProcess(Job job, int maxInputBytesPerProce job.getConfiguration().setInt(MAX_INPUT_BYTES_PER_PROCESS, maxInputBytesPerProcess); } + public static void setHasReduce(Job job, boolean hasReduce) { + job.getConfiguration().setBoolean(HAS_REDUCE, hasReduce); + } + @Override protected void setup(Context context) throws IOException, InterruptedException { super.setup(context); @@ -104,6 +115,14 @@ protected void setup(Context context) throws IOException, InterruptedException { if (!format.isPlain()) { format = format.inPlain(); } + if (conf.getBoolean(HAS_REDUCE, false)) { + // If the job has a reduce step, the output will be written by the reducer + // No need to write the output here + multipleOutputs = false; + } else { + // If the job does not have a reduce step, the output will be written by the mapper + multipleOutputs = true; + } envs = new HashMap<>(); addEnvironment(envs, conf); @@ -327,6 +346,15 @@ private void closeProcess(Context context) throws IOException, InterruptedExcept } catch (Throwable th) { addException(th); } + + try { + if (mos != null) { + mos.close(); + mos = null; + } + } catch (Throwable th) { + addException(th); + } // drainStdout(context); } @@ -340,6 +368,9 @@ private void startProcess(Context context) throws IOException, StorageEngineExce if (firstVariant == null) { firstVariant = variant.getChromosome() + ":" + variant.getStart(); } + if (multipleOutputs) { + mos = new MultipleOutputs<>(context); + } stdoutKeyNum = 0; stderrKeyNum = 0; @@ -469,8 +500,13 @@ public void run() { private void write(Text line) throws IOException, InterruptedException { numRecords++; - context.write(new VariantLocusKey(currentChromosome, currentPosition, - StreamVariantReducer.STDOUT_KEY + (stdoutKeyNum++)), line); + VariantLocusKey locusKey = new VariantLocusKey(currentChromosome, currentPosition, + StreamVariantReducer.STDOUT_KEY + (stdoutKeyNum++)); + if (multipleOutputs) { + mos.write(STDOUT_NAMED_OUTPUT, locusKey, line, buildOutputKeyPrefix(STDOUT_NAMED_OUTPUT, currentChromosome, currentPosition)); + } else { + context.write(locusKey, line); + } } } @@ -551,8 +587,14 @@ private void write(String line) throws IOException, InterruptedException { } private void write(Text line) throws IOException, InterruptedException { - context.write(new VariantLocusKey(currentChromosome, currentPosition, - StreamVariantReducer.STDERR_KEY + (stderrKeyNum++)), line); + VariantLocusKey locusKey = new VariantLocusKey(currentChromosome, currentPosition, + StreamVariantReducer.STDERR_KEY + (stderrKeyNum++)); + + if (multipleOutputs) { + mos.write(STDERR_NAMED_OUTPUT, locusKey, line, buildOutputKeyPrefix(STDERR_NAMED_OUTPUT, currentChromosome, currentPosition)); + } else { + context.write(locusKey, line); + } } private boolean matchesReporter(String line) { From 0100097cdfa5b1c4dc8398b93931062767f31c5f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jacobo=20Coll=20Morag=C3=B3n?= Date: Thu, 7 Nov 2024 16:14:29 +0000 Subject: [PATCH 28/66] storage: Fix VariantRecordWriter bytes_written counter. 
#TASK-6722 --- .../opencga/storage/hadoop/utils/AbstractHBaseDriver.java | 7 +++---- .../storage/hadoop/variant/io/HadoopVariantExporter.java | 4 ++-- .../storage/hadoop/variant/io/VariantExporterDriver.java | 2 +- .../storage/hadoop/variant/mr/StreamVariantMapper.java | 6 ++++-- .../storage/hadoop/variant/mr/VariantFileOutputFormat.java | 7 ++++--- 5 files changed, 14 insertions(+), 12 deletions(-) diff --git a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/utils/AbstractHBaseDriver.java b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/utils/AbstractHBaseDriver.java index 7a069d6efa3..685ece6c459 100644 --- a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/utils/AbstractHBaseDriver.java +++ b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/utils/AbstractHBaseDriver.java @@ -368,7 +368,6 @@ protected Path getTempOutdir(String prefix, String suffix, boolean ensureHdfs) t } } } - LOGGER.info("Temporary directory: " + toUri(tmpDir)); return new Path(tmpDir, fileName); } @@ -454,10 +453,10 @@ protected void getOutputPath() throws IOException { outdir.getFileSystem(getConf()).deleteOnExit(outdir); } if (localOutput != null) { - LOGGER.info(" * Outdir file: " + toUri(localOutput)); - LOGGER.info(" * Temporary outdir file: " + toUri(outdir)); + LOGGER.info(" * Output file : " + toUri(localOutput)); + LOGGER.info(" * Temporary outdir : " + toUri(outdir)); } else { - LOGGER.info(" * Outdir file: " + toUri(outdir)); + LOGGER.info(" * Outdir: " + toUri(outdir)); } } } diff --git a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/io/HadoopVariantExporter.java b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/io/HadoopVariantExporter.java index b16ef093616..53511b84739 100644 --- a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/io/HadoopVariantExporter.java +++ b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/io/HadoopVariantExporter.java @@ -181,13 +181,13 @@ public List export(@Nullable URI outputFileUri, VariantWriterFactory.Varian + " Current variants threshold is {}, and matchRatioThreshold is {}% ." + " Consider small query." + " Skip MapReduce", - count, totalCount, String.format("%.4f", matchRate * 100), variantsThreshold, matchRatioThreshold); + count, totalCount, String.format("%.4f", matchRate * 100), variantsThreshold, matchRatioThreshold * 100); smallQuery = true; } else { logger.info("Query for approximately {} of {} variants, which is {}% of the total." + " Current variants threshold is {}, and matchRatioThreshold is {}% ." 
+ " Not a small query", - count, totalCount, String.format("%.3f", matchRate * 100), variantsThreshold, matchRatioThreshold); + count, totalCount, String.format("%.3f", matchRate * 100), variantsThreshold, matchRatioThreshold * 100); } } catch (VariantSearchException e) { logger.info("Unable to count variants from SearchEngine", e); diff --git a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/io/VariantExporterDriver.java b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/io/VariantExporterDriver.java index 4c7abd32863..b29287a407d 100644 --- a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/io/VariantExporterDriver.java +++ b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/io/VariantExporterDriver.java @@ -201,7 +201,7 @@ public static String buildOutputKeyPrefix(String namedOutput, String chromosome, chromosome = "0" + chromosome; } - return String.format("%s.%s.%010d.", namedOutput, chromosome, start); + return String.format("%s.%s.%010d", namedOutput, chromosome, start); } private MultipleOutputs mos; diff --git a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/mr/StreamVariantMapper.java b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/mr/StreamVariantMapper.java index 4805e5fe6ba..2af52990fb6 100644 --- a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/mr/StreamVariantMapper.java +++ b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/mr/StreamVariantMapper.java @@ -503,7 +503,8 @@ private void write(Text line) throws IOException, InterruptedException { VariantLocusKey locusKey = new VariantLocusKey(currentChromosome, currentPosition, StreamVariantReducer.STDOUT_KEY + (stdoutKeyNum++)); if (multipleOutputs) { - mos.write(STDOUT_NAMED_OUTPUT, locusKey, line, buildOutputKeyPrefix(STDOUT_NAMED_OUTPUT, currentChromosome, currentPosition)); + mos.write(STDOUT_NAMED_OUTPUT, locusKey, line, + buildOutputKeyPrefix(STDOUT_NAMED_OUTPUT, currentChromosome, currentPosition)); } else { context.write(locusKey, line); } @@ -591,7 +592,8 @@ private void write(Text line) throws IOException, InterruptedException { StreamVariantReducer.STDERR_KEY + (stderrKeyNum++)); if (multipleOutputs) { - mos.write(STDERR_NAMED_OUTPUT, locusKey, line, buildOutputKeyPrefix(STDERR_NAMED_OUTPUT, currentChromosome, currentPosition)); + mos.write(STDERR_NAMED_OUTPUT, locusKey, line, + buildOutputKeyPrefix(STDERR_NAMED_OUTPUT, currentChromosome, currentPosition)); } else { context.write(locusKey, line); } diff --git a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/mr/VariantFileOutputFormat.java b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/mr/VariantFileOutputFormat.java index 16dd0fffa3f..ddff988a119 100644 --- a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/mr/VariantFileOutputFormat.java +++ 
b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/mr/VariantFileOutputFormat.java @@ -73,7 +73,8 @@ public RecordWriter getRecordWriter(TaskAttemptContext jo if (isCompressed) { out = new DataOutputStream(codec.createOutputStream(out)); } - return new VariantRecordWriter(configureWriter(job, out), out); + CountingOutputStream countingOut = new CountingOutputStream(out); + return new VariantRecordWriter(configureWriter(job, countingOut), countingOut); } private DataWriter configureWriter(final TaskAttemptContext job, OutputStream fileOut) throws IOException { @@ -103,9 +104,9 @@ protected static class VariantRecordWriter extends RecordWriter writer; private final CountingOutputStream outputStream; - public VariantRecordWriter(DataWriter writer, OutputStream outputStream) { + public VariantRecordWriter(DataWriter writer, CountingOutputStream outputStream) { this.writer = writer; - this.outputStream = new CountingOutputStream(outputStream); + this.outputStream = outputStream; } @Override From b52ca2738e62e9df3ec0bbe79a43255dc9dff53f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jacobo=20Coll=20Morag=C3=B3n?= Date: Fri, 8 Nov 2024 19:42:38 +0000 Subject: [PATCH 29/66] storage: Reduce number of intermediate mapper files. #TASK-6722 --- .../variant/io/VariantExporterDriver.java | 13 +---- .../variant/mr/StreamVariantMapper.java | 30 ++++++----- .../hadoop/variant/mr/VariantLocusKey.java | 54 +++++++++++++++++++ .../variant/mr/VariantLocusKeyTest.java | 22 ++++++++ 4 files changed, 93 insertions(+), 26 deletions(-) diff --git a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/io/VariantExporterDriver.java b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/io/VariantExporterDriver.java index b29287a407d..7b1e96f22ea 100644 --- a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/io/VariantExporterDriver.java +++ b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/io/VariantExporterDriver.java @@ -215,24 +215,13 @@ protected void setup(Context context) throws IOException, InterruptedException { @Override protected void map(Object key, Variant value, Context context) throws IOException, InterruptedException { context.getCounter(COUNTER_GROUP_NAME, "variants").increment(1); - if (baseOutputPath == null || !consecutiveChromosomes(chromosome, value.getChromosome())) { + if (baseOutputPath == null || !VariantLocusKey.naturalConsecutiveChromosomes(chromosome, value.getChromosome())) { baseOutputPath = buildOutputKeyPrefix(NAMED_OUTPUT, value.getChromosome(), value.getStart()); chromosome = value.getChromosome(); } mos.write(NAMED_OUTPUT, value, NullWritable.get(), baseOutputPath); } - private static boolean consecutiveChromosomes(String prevChromosome, String newChromosome) { - if (newChromosome.equals(prevChromosome)) { - return true; - } - if (VariantLocusKey.isSingleDigitChromosome(prevChromosome)) { - return VariantLocusKey.isSingleDigitChromosome(newChromosome); - } else { - return !VariantLocusKey.isSingleDigitChromosome(newChromosome); - } - } - @Override protected void cleanup(Mapper.Context context) throws IOException, InterruptedException { super.cleanup(context); diff --git 
a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/mr/StreamVariantMapper.java b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/mr/StreamVariantMapper.java index 2af52990fb6..b490f251508 100644 --- a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/mr/StreamVariantMapper.java +++ b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/mr/StreamVariantMapper.java @@ -78,6 +78,8 @@ public class StreamVariantMapper extends VariantMapper { private long numRecordsRead = 0; private long numRecordsWritten = 0; private MultipleOutputs mos; + private String stdoutBaseOutputPath; + private String stderrBaseOutputPath; // auto-incremental number for each produced record. // These are used with the VariantLocusKey to ensure a sorted output. private int stdoutKeyNum; @@ -150,12 +152,11 @@ public void run(Context context) throws IOException, InterruptedException { // or if the chromosome changes if (processedBytes > maxInputBytesPerProcess) { LOG.info("Processed bytes = " + processedBytes + " > " + maxInputBytesPerProcess + ". Restarting process."); - restartProcess(context, "bytes_limit"); - } else if (!currentChromosome.equals(currentValue.getChromosome())) { - // TODO: Should we change only when the chromosome change would produce a partition change? + restartProcess(context, "bytes_limit", false); + } else if (!VariantLocusKey.naturalConsecutiveChromosomes(currentChromosome, currentValue.getChromosome())) { LOG.info("Chromosome changed from " + currentChromosome + " to " + currentValue.getChromosome() + ". 
Restarting process."); - restartProcess(context, "chr_change"); + restartProcess(context, "chr_change", true); } map(context.getCurrentKey(), currentValue, context); } while (!hasExceptions() && context.nextKeyValue()); @@ -193,10 +194,10 @@ public void run(Context context) throws IOException, InterruptedException { throwExceptionIfAny(); } - private void restartProcess(Mapper.Context context, String reason) + private void restartProcess(Mapper.Context context, String reason, boolean restartOutput) throws IOException, InterruptedException, StorageEngineException { context.getCounter(COUNTER_GROUP_NAME, "restarted_process_" + reason).increment(1); - closeProcess(context); + closeProcess(context, restartOutput); startProcess(context); } @@ -267,7 +268,7 @@ private void throwExceptionIfAny() throws IOException { @Override protected void cleanup(Mapper.Context context) throws IOException, InterruptedException { - closeProcess(context); + closeProcess(context, true); dockerPruneImages(); super.cleanup(context); } @@ -300,7 +301,7 @@ protected void map(Object key, Variant value, Context context) throws IOExceptio processedBytes = stdin.size(); } - private void closeProcess(Context context) throws IOException, InterruptedException { + private void closeProcess(Context context, boolean closeOutputs) throws IOException, InterruptedException { try { if (variantDataWriter != null) { @@ -348,7 +349,8 @@ private void closeProcess(Context context) throws IOException, InterruptedExcept } try { - if (mos != null) { + // Close the MultipleOutputs if required + if (mos != null && closeOutputs) { mos.close(); mos = null; } @@ -368,8 +370,10 @@ private void startProcess(Context context) throws IOException, StorageEngineExce if (firstVariant == null) { firstVariant = variant.getChromosome() + ":" + variant.getStart(); } - if (multipleOutputs) { + if (multipleOutputs && mos == null) { mos = new MultipleOutputs<>(context); + stdoutBaseOutputPath = buildOutputKeyPrefix(STDOUT_NAMED_OUTPUT, currentChromosome, currentPosition); + stderrBaseOutputPath = buildOutputKeyPrefix(STDERR_NAMED_OUTPUT, currentChromosome, currentPosition); } stdoutKeyNum = 0; stderrKeyNum = 0; @@ -503,8 +507,7 @@ private void write(Text line) throws IOException, InterruptedException { VariantLocusKey locusKey = new VariantLocusKey(currentChromosome, currentPosition, StreamVariantReducer.STDOUT_KEY + (stdoutKeyNum++)); if (multipleOutputs) { - mos.write(STDOUT_NAMED_OUTPUT, locusKey, line, - buildOutputKeyPrefix(STDOUT_NAMED_OUTPUT, currentChromosome, currentPosition)); + mos.write(STDOUT_NAMED_OUTPUT, locusKey, line, stdoutBaseOutputPath); } else { context.write(locusKey, line); } @@ -592,8 +595,7 @@ private void write(Text line) throws IOException, InterruptedException { StreamVariantReducer.STDERR_KEY + (stderrKeyNum++)); if (multipleOutputs) { - mos.write(STDERR_NAMED_OUTPUT, locusKey, line, - buildOutputKeyPrefix(STDERR_NAMED_OUTPUT, currentChromosome, currentPosition)); + mos.write(STDERR_NAMED_OUTPUT, locusKey, line, stderrBaseOutputPath); } else { context.write(locusKey, line); } diff --git a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/mr/VariantLocusKey.java b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/mr/VariantLocusKey.java index ce6d4926120..b04f6fc6018 100644 --- 
a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/mr/VariantLocusKey.java
+++ b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/mr/VariantLocusKey.java
@@ -35,6 +35,56 @@ public VariantLocusKey(String chromosome, int position, String other) {
 this.other = other;
 }
+ /**
+ * Check if two lexicographically ordered chromosomes are also consecutive in natural order,
+ * or if other chromosomes might sort in between.
+ * e.g.
+ * naturalConsecutiveChromosomes("1", "2") == true
+ * naturalConsecutiveChromosomes("1", "10") == false
+ * naturalConsecutiveChromosomes("1", "X") == false
+ * naturalConsecutiveChromosomes("X", "Y") == true
+ * @param prevChromosome Previous chromosome
+ * @param newChromosome New chromosome
+ * @return True if the chromosomes are consecutive in natural order
+ */
+ public static boolean naturalConsecutiveChromosomes(String prevChromosome, String newChromosome) {
+ if (newChromosome.equals(prevChromosome)) {
+ return true;
+ }
+ if (isDigitChromosome(prevChromosome)) {
+ // prevChromosome == 1 or 10
+ if (isSingleDigitChromosome(prevChromosome)) {
+ // prevChromosome == 1
+ if (isDigitChromosome(newChromosome)) {
+ // newChromosome == 2 or 10
+ // 1 -> 2 : TRUE
+ // 1 -> 10 : FALSE
+ return isSingleDigitChromosome(newChromosome);
+ } else {
+ // newChromosome == X
+ // 1 -> X : FALSE
+ return false;
+ }
+ } else {
+ // prevChromosome == 10
+ if (isDigitChromosome(newChromosome)) {
+ // newChromosome == 11 or 2
+ // 10 -> 11 : TRUE
+ // 10 -> 2 : FALSE
+ return !isSingleDigitChromosome(newChromosome);
+ } else {
+ // newChromosome == X
+ // 10 -> X : FALSE
+ return false;
+ }
+ }
+ } else {
+ // prevChromosome == X
+ // X -> Y : TRUE
+ // X -> 1 : FALSE
+ return !isDigitChromosome(newChromosome);
+ }
+ }
+ @Override public int compareTo(VariantLocusKey o) { String chr1; @@ -69,6 +119,10 @@ public static boolean isSingleDigitChromosome(String chromosome) { return Character.isDigit(chromosome.charAt(0)) && (chromosome.length() == 1 || !Character.isDigit(chromosome.charAt(1))); }
+ private static boolean isDigitChromosome(String chromosome) {
+ return Character.isDigit(chromosome.charAt(0));
+ }
+ @Override public void write(DataOutput out) throws IOException { out.writeUTF(chromosome); diff --git a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/test/java/org/opencb/opencga/storage/hadoop/variant/mr/VariantLocusKeyTest.java b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/test/java/org/opencb/opencga/storage/hadoop/variant/mr/VariantLocusKeyTest.java index 74552d1f241..0535d0e00dc 100644 --- a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/test/java/org/opencb/opencga/storage/hadoop/variant/mr/VariantLocusKeyTest.java +++ b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/test/java/org/opencb/opencga/storage/hadoop/variant/mr/VariantLocusKeyTest.java @@ -175,4 +175,26 @@ private static void testWriteAndRead(VariantLocusKey originalKey) throws IOExcep // Assert that the read object is equal to the original object assertEquals(originalKey, readKey); }
+
+ @Test
+ public void shouldTestConsecutiveChromosomesWithAlternateConfigs() {
+ assertTrue(VariantLocusKey.naturalConsecutiveChromosomes("1", "1"));
+ assertTrue(VariantLocusKey.naturalConsecutiveChromosomes("1", "1_random"));
+ assertTrue(VariantLocusKey.naturalConsecutiveChromosomes("1", "2"));
+
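// Note: "consecutive" means the lexicographic key order agrees with natural order across the transition,
+ // so "1" -> "3" is accepted, while "1" -> "10" is not ("10" sorts before "2" lexicographically).
+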
assertTrue(VariantLocusKey.naturalConsecutiveChromosomes("1", "3")); + assertTrue(VariantLocusKey.naturalConsecutiveChromosomes("10", "11")); + assertFalse(VariantLocusKey.naturalConsecutiveChromosomes("1", "10")); + assertFalse(VariantLocusKey.naturalConsecutiveChromosomes("9", "10")); + assertFalse(VariantLocusKey.naturalConsecutiveChromosomes("2", "20")); + assertFalse(VariantLocusKey.naturalConsecutiveChromosomes("22", "X")); + assertFalse(VariantLocusKey.naturalConsecutiveChromosomes("1", "X")); + assertTrue(VariantLocusKey.naturalConsecutiveChromosomes("X", "Y")); + assertTrue(VariantLocusKey.naturalConsecutiveChromosomes("X", "Z")); + assertTrue(VariantLocusKey.naturalConsecutiveChromosomes("1_random", "1_random")); + assertTrue(VariantLocusKey.naturalConsecutiveChromosomes("1_randomA", "1_randomB")); + assertTrue(VariantLocusKey.naturalConsecutiveChromosomes("1_randomB", "1_randomA")); + assertTrue(VariantLocusKey.naturalConsecutiveChromosomes("1_random", "2_random")); + assertTrue(VariantLocusKey.naturalConsecutiveChromosomes("10_random", "11_random")); + assertFalse(VariantLocusKey.naturalConsecutiveChromosomes("1_random", "10_random")); + } } \ No newline at end of file From ad3521e0eb5d3f6265cb610a0bd2ece768623705 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jacobo=20Coll=20Morag=C3=B3n?= Date: Fri, 8 Nov 2024 23:04:51 +0000 Subject: [PATCH 30/66] storage: Use SNAPPY as intermediate compression algorithm. #TASK-6722 --- .../hadoop/utils/AbstractHBaseDriver.java | 106 ++++++++++++++---- .../variant/io/VariantExporterDriver.java | 10 +- .../variant/mr/StreamVariantDriver.java | 12 +- 3 files changed, 97 insertions(+), 31 deletions(-) diff --git a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/utils/AbstractHBaseDriver.java b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/utils/AbstractHBaseDriver.java index 685ece6c459..8fbd131b72f 100644 --- a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/utils/AbstractHBaseDriver.java +++ b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/utils/AbstractHBaseDriver.java @@ -7,16 +7,21 @@ import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.conf.Configured; import org.apache.hadoop.fs.FileSystem; -import org.apache.hadoop.fs.*; +import org.apache.hadoop.fs.LocatedFileStatus; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.fs.RemoteIterator; import org.apache.hadoop.hbase.HBaseConfiguration; import org.apache.hadoop.hbase.mapreduce.TableInputFormat; import org.apache.hadoop.hbase.mapreduce.TableOutputFormat; import org.apache.hadoop.io.IOUtils; +import org.apache.hadoop.io.compress.*; import org.apache.hadoop.mapred.JobConf; import org.apache.hadoop.mapreduce.*; import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; import org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter; import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; +import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat; +import org.apache.hadoop.util.ReflectionUtils; import org.apache.hadoop.util.Tool; import org.apache.hadoop.util.ToolRunner; import org.apache.hadoop.yarn.api.records.ApplicationId; @@ -44,8 +49,6 @@ import java.util.*; import java.util.function.Supplier; import java.util.stream.Collectors; -import java.util.zip.GZIPInputStream; -import 
java.util.zip.GZIPOutputStream; import static org.opencb.opencga.core.common.IOUtils.humanReadableByteCount; import static org.opencb.opencga.storage.hadoop.variant.HadoopVariantStorageOptions.MR_EXECUTOR_SSH_PASSWORD; @@ -196,6 +199,11 @@ public final int run(String[] args) throws Exception { LOGGER.info(" - OutputTable : " + job.getConfiguration().get(TableOutputFormat.OUTPUT_TABLE)); } else if (StringUtils.isNotEmpty(job.getConfiguration().get(FileOutputFormat.OUTDIR))) { LOGGER.info(" - Outdir : " + job.getConfiguration().get(FileOutputFormat.OUTDIR)); + + if (TextOutputFormat.getCompressOutput(job)) { + Class compressorClass = TextOutputFormat.getOutputCompressorClass(job, GzipCodec.class); + LOGGER.info(" - Compress : " + compressorClass.getName()); + } } LOGGER.info("================================================="); reportRunningJobs(); @@ -431,18 +439,18 @@ public class MapReduceOutputFile { public MapReduceOutputFile(String tempFilePrefix) throws IOException { this.nameGenerator = () -> null; this.tempFilePrefix = tempFilePrefix; - getOutputPath(); + initOutputPath(); namedOutput = null; } public MapReduceOutputFile(Supplier nameGenerator, String tempFilePrefix) throws IOException { this.nameGenerator = nameGenerator; this.tempFilePrefix = tempFilePrefix; - getOutputPath(); + initOutputPath(); namedOutput = null; } - protected void getOutputPath() throws IOException { + private void initOutputPath() throws IOException { String outdirStr = getParam(OUTPUT_PARAM); if (StringUtils.isNotEmpty(outdirStr)) { outdir = new Path(outdirStr); @@ -452,7 +460,7 @@ protected void getOutputPath() throws IOException { outdir = getTempOutdir(tempFilePrefix, localOutput.getName()); outdir.getFileSystem(getConf()).deleteOnExit(outdir); } - if (localOutput != null) { + if (hasTempOutput()) { LOGGER.info(" * Output file : " + toUri(localOutput)); LOGGER.info(" * Temporary outdir : " + toUri(outdir)); } else { @@ -463,15 +471,19 @@ protected void getOutputPath() throws IOException { public void postExecute(boolean succeed) throws IOException { if (succeed) { - if (localOutput != null) { + if (hasTempOutput()) { getConcatMrOutputToLocal(); } } - if (localOutput != null) { + if (hasTempOutput()) { deleteTemporaryFile(outdir); } } + public boolean hasTempOutput() { + return localOutput != null; + } + public MapReduceOutputFile setNamedOutput(String partFilePrefix) { this.namedOutput = partFilePrefix; return this; @@ -556,27 +568,20 @@ protected List concatMrOutputToLocal(Path mrOutdir, Path localOutput, bool fileSystem.copyToLocalFile(false, paths.get(0), localOutput); } else { LOGGER.info("Concat and copy to local : " + paths.size() + " partial files"); - LOGGER.info(" Source : " + mrOutdir.toUri()); - LOGGER.info(" Target : " + localOutput.toUri()); + LOGGER.info(" Source {}: {}", getCompression(paths.get(0).getName()), mrOutdir.toUri()); + LOGGER.info(" Target {}: {}", getCompression(localOutput.getName()), localOutput.toUri()); LOGGER.info(" ---- "); - boolean isGzip = paths.get(0).getName().endsWith(".gz"); - try (FSDataOutputStream fsOs = localOutput.getFileSystem(getConf()).create(localOutput); - OutputStream gzOs = isGzip ? new GZIPOutputStream(fsOs) : null) { - OutputStream os = gzOs == null ? 
fsOs : gzOs; +
+ try (OutputStream os = getOutputStreamPlain(localOutput.getName(), localOutput.getFileSystem(getConf()).create(localOutput))) {
 for (int i = 0; i < paths.size(); i++) {
 Path path = paths.get(i);
 LOGGER.info("[{}] Concat {} file : '{}' ({}) ",
 i,
 getCompression(path.getName()),
 path.toUri(),
 humanReadableByteCount(fileSystem.getFileStatus(path).getLen(), false));
 try (InputStream isAux = getInputStream(path.getName(), fileSystem.open(path))) {
 InputStream is = isAux;
 // Remove extra headers from all files but the first
 if (removeExtraHeaders && i != 0) {
 BufferedReader br = new BufferedReader(new InputStreamReader(is));
@@ -600,6 +605,59 @@ protected List concatMrOutputToLocal(Path mrOutdir, Path localOutput, bool
 return paths;
 }
+ private static String getCompression(String name) throws IOException {
+ if (name.endsWith(".gz")) {
+ return "gzip";
+ } else if (name.endsWith(".snappy")) {
+ return "snappy";
+ } else if (name.endsWith(".lz4")) {
+ return "lz4";
+ } else if (name.endsWith(".zst")) {
+ return "zstandard";
+ } else {
+ return "plain";
+ }
+ }
+
+ private OutputStream getOutputStreamPlain(String name, OutputStream fsOs) throws IOException {
+ CompressionCodec codec = getCompressionCodec(name);
+ if (codec == null) {
+ return fsOs;
+ }
+ return codec.createOutputStream(fsOs);
+ }
+
+ private CompressionCodec getCompressionCodec(String name) throws IOException {
+ Class<? extends CompressionCodec> codecClass;
+ switch (getCompression(name)) {
+ case "gzip":
+ codecClass = GzipCodec.class;
+ break;
+ case "snappy":
+ codecClass = SnappyCodec.class;
+ break;
+ case "lz4":
+ codecClass = Lz4Codec.class;
+ break;
+ case "zstandard":
+ codecClass = ZStandardCodec.class;
+ break;
+ case "plain":
+ return null;
+ default:
+ throw new IOException("Unknown compression codec for file " + name);
+ }
+ return ReflectionUtils.newInstance(codecClass, getConf());
+ }
+
+ private InputStream getInputStream(String name, InputStream is) throws IOException {
+ CompressionCodec codec = getCompressionCodec(name);
+ if (codec == null) {
+ return is;
+ }
+ return codec.createInputStream(is);
+ }
+
 protected final int getServersSize(String table) throws IOException {
 int serversSize;
 try (HBaseManager hBaseManager = new HBaseManager(getConf())) {
diff --git a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/io/VariantExporterDriver.java b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/io/VariantExporterDriver.java
index 7b1e96f22ea..93a75006fb4 100644
--- a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/io/VariantExporterDriver.java
+++ b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/io/VariantExporterDriver.java
@@ -140,10 +140,12 @@ protected void setupJob(Job job) throws IOException {
 LazyOutputFormat.setOutputFormatClass(job, TextOutputFormat.class);
 outputFormatClass = LazyOutputFormat.class;
 }
- if (outputFormat.isGzip()) {
- FileOutputFormat.setOutputCompressorClass(job, GzipCodec.class); // compression
- } else if (outputFormat.isSnappy()) {
-
FileOutputFormat.setOutputCompressorClass(job, SnappyCodec.class); // compression + if (SnappyCodec.isNativeCodeLoaded()) { + FileOutputFormat.setCompressOutput(job, true); + FileOutputFormat.setOutputCompressorClass(job, SnappyCodec.class); + } else { + FileOutputFormat.setCompressOutput(job, true); + FileOutputFormat.setOutputCompressorClass(job, GzipCodec.class); } job.getConfiguration().set(VariantFileOutputFormat.VARIANT_OUTPUT_FORMAT, outputFormat.name()); job.setOutputKeyClass(Variant.class); diff --git a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/mr/StreamVariantDriver.java b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/mr/StreamVariantDriver.java index 0985b6c0f6c..91ac57391dc 100644 --- a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/mr/StreamVariantDriver.java +++ b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/mr/StreamVariantDriver.java @@ -5,11 +5,13 @@ import org.apache.hadoop.io.compress.CompressionCodec; import org.apache.hadoop.io.compress.DeflateCodec; import org.apache.hadoop.io.compress.GzipCodec; +import org.apache.hadoop.io.compress.SnappyCodec; import org.apache.hadoop.mapred.JobContext; import org.apache.hadoop.mapreduce.Job; import org.apache.hadoop.mapreduce.OutputFormat; import org.apache.hadoop.mapreduce.Partitioner; import org.apache.hadoop.mapreduce.Reducer; +import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; import org.apache.hadoop.mapreduce.lib.output.LazyOutputFormat; import org.apache.hadoop.mapreduce.lib.output.MultipleOutputs; import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat; @@ -162,9 +164,13 @@ protected void setupJob(Job job) throws IOException { outputFormatClass = LazyOutputFormat.class; job.setOutputFormatClass(ValueOnlyTextOutputFormat.class); - TextOutputFormat.setCompressOutput(job, true); - TextOutputFormat.setOutputCompressorClass(job, GzipCodec.class); -// TextOutputFormat.setOutputCompressorClass(job, DeflateCodec.class); + if (SnappyCodec.isNativeCodeLoaded()) { + FileOutputFormat.setCompressOutput(job, true); + FileOutputFormat.setOutputCompressorClass(job, SnappyCodec.class); + } else { + FileOutputFormat.setCompressOutput(job, true); + FileOutputFormat.setOutputCompressorClass(job, GzipCodec.class); + } job.setOutputKeyClass(keyClass); job.setOutputValueClass(valueClass); } From ab50d6ef738e4c5cd6687c07d84161e23b744a00 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jacobo=20Coll=20Morag=C3=B3n?= Date: Mon, 11 Nov 2024 09:08:32 +0000 Subject: [PATCH 31/66] storage: Disable flush on AbfsOutputStream. 
HADOOP-16548 #TASK-6722
---
 .../hadoop/variant/io/CountingOutputStream.java | 5 +++++
 .../variant/mr/VariantFileOutputFormat.java | 17 ++++++++++++++++-
 2 files changed, 21 insertions(+), 1 deletion(-)
diff --git a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/io/CountingOutputStream.java b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/io/CountingOutputStream.java
index 93f3dcd9bf8..5a50d3293a2 100644
--- a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/io/CountingOutputStream.java
+++ b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/io/CountingOutputStream.java
@@ -30,6 +30,11 @@ public void write(byte[] b, int off, int len) throws IOException {
 count += len;
 }
+ @Override
+ public void close() throws IOException {
+ out.close();
+ }
+
 public long getByteCount() {
 return count;
 }
diff --git a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/mr/VariantFileOutputFormat.java b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/mr/VariantFileOutputFormat.java
index ddff988a119..248bcc5d165 100644
--- a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/mr/VariantFileOutputFormat.java
+++ b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/mr/VariantFileOutputFormat.java
@@ -17,8 +17,10 @@ package org.opencb.opencga.storage.hadoop.variant.mr;
 import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FSDataOutputStream;
 import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.fs.azurebfs.services.AbfsOutputStream;
 import org.apache.hadoop.io.NullWritable;
 import org.apache.hadoop.io.compress.CompressionCodec;
 import org.apache.hadoop.io.compress.GzipCodec;
@@ -37,6 +39,7 @@ import org.opencb.opencga.storage.hadoop.variant.metadata.HBaseVariantStorageMetadataDBAdaptorFactory;
 import java.io.DataOutputStream;
+import java.io.FilterOutputStream;
 import java.io.IOException;
 import java.io.OutputStream;
@@ -69,7 +72,19 @@ public RecordWriter getRecordWriter(TaskAttemptContext jo
 }
 Path file = this.getDefaultWorkFile(job, extension);
 FileSystem fs = file.getFileSystem(conf);
- OutputStream out = fs.create(file, false);
+ FSDataOutputStream fsOs = fs.create(file, false);
+ OutputStream out;
+ if (fsOs.getWrappedStream() instanceof AbfsOutputStream) {
+ // Disable flush on ABFS. See HADOOP-16548
+ out = new FilterOutputStream(fsOs) {
+ @Override
+ public void flush() throws IOException {
+ // Do nothing
+ }
+ };
+ } else {
+ out = fsOs;
+ }
 if (isCompressed) {
 out = new DataOutputStream(codec.createOutputStream(out));
 }

From 212f8ce32503059da25a489a9b9850584b6beeb2 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jacobo=20Coll=20Morag=C3=B3n?=
Date: Mon, 11 Nov 2024 10:34:23 +0000
Subject: [PATCH 32/66] storage: Centralize variantMapperJob initialization.
#TASK-6722 --- .../hadoop/utils/AbstractHBaseDriver.java | 9 ++++ .../analysis/julie/JulieToolDriver.java | 2 +- .../hadoop/variant/io/VariantDriver.java | 35 +----------- .../AbstractHBaseVariantTableInputFormat.java | 1 + .../mr/SampleIndexTableRecordReader.java | 8 +++ .../variant/mr/VariantMapReduceUtil.java | 53 ++++++++++--------- 6 files changed, 48 insertions(+), 60 deletions(-) diff --git a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/utils/AbstractHBaseDriver.java b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/utils/AbstractHBaseDriver.java index 8fbd131b72f..d2d7a682eb6 100644 --- a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/utils/AbstractHBaseDriver.java +++ b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/utils/AbstractHBaseDriver.java @@ -37,6 +37,7 @@ import org.opencb.opencga.storage.core.exceptions.StorageEngineException; import org.opencb.opencga.storage.hadoop.io.HDFSIOConnector; import org.opencb.opencga.storage.hadoop.variant.executors.SshMRExecutor; +import org.opencb.opencga.storage.hadoop.variant.mr.AbstractHBaseVariantTableInputFormat; import org.opencb.opencga.storage.hadoop.variant.mr.VariantMapReduceUtil; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -170,6 +171,14 @@ public final int run(String[] args) throws Exception { LOGGER.info(" * InputFormat : " + job.getInputFormatClass().getName()); if (StringUtils.isNotEmpty(job.getConfiguration().get(TableInputFormat.INPUT_TABLE))) { LOGGER.info(" - InputTable : " + job.getConfiguration().get(TableInputFormat.INPUT_TABLE)); + if (job.getConfiguration().getBoolean(AbstractHBaseVariantTableInputFormat.USE_SAMPLE_INDEX_TABLE_INPUT_FORMAT, false)) { + String sampleIndexTable = job.getConfiguration().get(AbstractHBaseVariantTableInputFormat.SAMPLE_INDEX_TABLE); + if (StringUtils.isNotEmpty(sampleIndexTable)) { + LOGGER.info(" - SecondarySampleIndexTable : " + sampleIndexTable); + } else { + LOGGER.info(" - SecondarySampleIndexTable : (not set)"); + } + } } else if (StringUtils.isNotEmpty(job.getConfiguration().get(PhoenixConfigurationUtil.INPUT_TABLE_NAME))) { LOGGER.info(" - InputPTable : " + job.getConfiguration().get(PhoenixConfigurationUtil.INPUT_TABLE_NAME)); } else if (StringUtils.isNotEmpty(job.getConfiguration().get(FileInputFormat.INPUT_DIR))) { diff --git a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/analysis/julie/JulieToolDriver.java b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/analysis/julie/JulieToolDriver.java index 5192e7356f9..25b80a7ef29 100644 --- a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/analysis/julie/JulieToolDriver.java +++ b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/analysis/julie/JulieToolDriver.java @@ -104,7 +104,7 @@ protected Job setupJob(Job job, String archiveTable, String variantTable) throws VariantMapReduceUtil.configureMapReduceScan(scan, getConf()); logger.info("Scan: " + scan); - VariantMapReduceUtil.initVariantRowMapperJobFromHBase(job, variantTable, scan, JulieToolMapper.class, 
false); + VariantMapReduceUtil.initVariantRowMapperJobFromHBase(job, variantTable, scan, JulieToolMapper.class); VariantMapReduceUtil.setOutputHBaseTable(job, variantTable); VariantMapReduceUtil.setNoneReduce(job); VariantMapReduceUtil.setNoneTimestamp(job); diff --git a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/io/VariantDriver.java b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/io/VariantDriver.java index 7a2324e17f2..4a78cb70f69 100644 --- a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/io/VariantDriver.java +++ b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/io/VariantDriver.java @@ -1,7 +1,6 @@ package org.opencb.opencga.storage.hadoop.variant.io; import org.apache.commons.lang3.StringUtils; -import org.apache.hadoop.hbase.client.Scan; import org.apache.hadoop.mapred.JobContext; import org.apache.hadoop.mapreduce.Job; import org.apache.hadoop.mapreduce.OutputFormat; @@ -12,21 +11,13 @@ import org.opencb.commons.datastore.core.QueryOptions; import org.opencb.opencga.storage.core.exceptions.StorageEngineException; import org.opencb.opencga.storage.core.variant.adaptors.VariantQueryParam; -import org.opencb.opencga.storage.core.variant.query.ParsedVariantQuery; -import org.opencb.opencga.storage.core.variant.query.VariantQueryParser; import org.opencb.opencga.storage.hadoop.variant.AbstractVariantsTableDriver; -import org.opencb.opencga.storage.hadoop.variant.HadoopVariantQueryParser; -import org.opencb.opencga.storage.hadoop.variant.adaptors.VariantHBaseQueryParser; -import org.opencb.opencga.storage.hadoop.variant.adaptors.phoenix.VariantSqlQueryParser; -import org.opencb.opencga.storage.hadoop.variant.index.sample.SampleIndexDBAdaptor; -import org.opencb.opencga.storage.hadoop.variant.index.sample.SampleIndexQueryParser; import org.opencb.opencga.storage.hadoop.variant.mr.VariantMapReduceUtil; import org.opencb.opencga.storage.hadoop.variant.mr.VariantMapper; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import java.io.IOException; -import java.util.List; import java.util.Map; import static org.opencb.opencga.storage.hadoop.variant.mr.VariantMapReduceUtil.getQueryFromConfig; @@ -110,31 +101,7 @@ protected final Job setupJob(Job job, String archiveTable, String variantTable) VariantMapReduceUtil.setNoneReduce(job); } - VariantQueryParser variantQueryParser = new HadoopVariantQueryParser(null, getMetadataManager()); - ParsedVariantQuery variantQuery = variantQueryParser.parseQuery(query, options); - Query query = variantQuery.getQuery(); - if (VariantHBaseQueryParser.isSupportedQuery(query)) { - logger.info("Init MapReduce job reading from HBase"); - boolean useSampleIndex = !getConf().getBoolean("skipSampleIndex", false) && SampleIndexQueryParser.validSampleIndexQuery(query); - if (useSampleIndex) { - // Remove extra fields from the query - new SampleIndexDBAdaptor(getHBaseManager(), getTableNameGenerator(), getMetadataManager()).parseSampleIndexQuery(query); - - logger.info("Use sample index to read from HBase"); - } - - VariantHBaseQueryParser parser = new VariantHBaseQueryParser(getMetadataManager()); - List scans = parser.parseQueryMultiRegion(variantQuery, options); - VariantMapReduceUtil.configureMapReduceScans(scans, getConf()); - - 
VariantMapReduceUtil.initVariantMapperJobFromHBase(job, variantTable, scans, mapperClass, useSampleIndex); - } else { - logger.info("Init MapReduce job reading from Phoenix"); - String sql = new VariantSqlQueryParser(variantTable, getMetadataManager(), getHelper().getConf()) - .parse(variantQuery, options); - - VariantMapReduceUtil.initVariantMapperJobFromPhoenix(job, variantTable, sql, mapperClass); - } + VariantMapReduceUtil.initVariantMapperJob(job, mapperClass, variantTable, getMetadataManager(), query, options, false); setNoneTimestamp(job); diff --git a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/mr/AbstractHBaseVariantTableInputFormat.java b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/mr/AbstractHBaseVariantTableInputFormat.java index a1d8bc16958..35965e8d87d 100644 --- a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/mr/AbstractHBaseVariantTableInputFormat.java +++ b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/mr/AbstractHBaseVariantTableInputFormat.java @@ -14,6 +14,7 @@ public abstract class AbstractHBaseVariantTableInputFormat extends TransformInputFormat { + public static final String SAMPLE_INDEX_TABLE = "hbase_variant_table_input_format.sample_index_table"; public static final String USE_SAMPLE_INDEX_TABLE_INPUT_FORMAT = "hbase_variant_table_input_format.use_sample_index_table_input_format"; public static final String MULTI_SCANS = "hbase_variant_table_input_format.multi_scans"; private Function converter; diff --git a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/mr/SampleIndexTableRecordReader.java b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/mr/SampleIndexTableRecordReader.java index 7e747813b0e..06df8d30154 100644 --- a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/mr/SampleIndexTableRecordReader.java +++ b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/mr/SampleIndexTableRecordReader.java @@ -72,6 +72,14 @@ public SampleIndexTableRecordReader(Configuration conf) { Query query = VariantMapReduceUtil.getQueryFromConfig(conf); sampleIndexQuery = sampleIndexDBAdaptor.parseSampleIndexQuery(query); + String sampleIndexTableName = sampleIndexDBAdaptor.getSampleIndexTableName(sampleIndexQuery); + String expectedSampleIndexTable = conf.get(AbstractHBaseVariantTableInputFormat.SAMPLE_INDEX_TABLE); + if (expectedSampleIndexTable != null) { + if (!expectedSampleIndexTable.equals(sampleIndexTableName)) { + throw new IllegalArgumentException("Expected SampleIndexTable " + + expectedSampleIndexTable + " but got " + sampleIndexTableName); + } + } StudyMetadata studyMetadata = metadataManager.getStudyMetadata(sampleIndexQuery.getStudy()); allChromosomes = new TreeSet<>(VariantPhoenixKeyFactory.HBASE_KEY_CHROMOSOME_COMPARATOR); diff --git a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/mr/VariantMapReduceUtil.java 
b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/mr/VariantMapReduceUtil.java index 0c3ab30a697..afaa220977b 100644 --- a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/mr/VariantMapReduceUtil.java +++ b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/mr/VariantMapReduceUtil.java @@ -31,6 +31,8 @@ import org.opencb.opencga.core.config.storage.SampleIndexConfiguration; import org.opencb.opencga.storage.core.metadata.VariantStorageMetadataManager; import org.opencb.opencga.storage.core.variant.adaptors.VariantQueryParam; +import org.opencb.opencga.storage.core.variant.query.ParsedVariantQuery; +import org.opencb.opencga.storage.core.variant.query.VariantQueryParser; import org.opencb.opencga.storage.core.variant.query.VariantQueryUtils; import org.opencb.opencga.storage.hadoop.utils.AbstractHBaseDriver; import org.opencb.opencga.storage.hadoop.variant.AbstractVariantsTableDriver; @@ -158,7 +160,10 @@ public static void initTableMapperJobFromPhoenix(Job job, String variantTable, S public static void initVariantMapperJob(Job job, Class mapperClass, String variantTable, VariantStorageMetadataManager metadataManager, Query query, QueryOptions queryOptions, boolean skipSampleIndex) throws IOException { - query = new HadoopVariantQueryParser(null, metadataManager).preProcessQuery(query, queryOptions); + VariantQueryParser variantQueryParser = new HadoopVariantQueryParser(null, metadataManager); + ParsedVariantQuery variantQuery = variantQueryParser.parseQuery(query, queryOptions); + query = variantQuery.getQuery(); + queryOptions = variantQuery.getInputOptions(); setQuery(job, query); setQueryOptions(job, queryOptions); @@ -166,26 +171,29 @@ public static void initVariantMapperJob(Job job, Class LOGGER.info("Init MapReduce job reading from HBase"); boolean useSampleIndex = !skipSampleIndex && SampleIndexQueryParser.validSampleIndexQuery(query); + String sampleIndexTable = null; if (useSampleIndex) { Object regions = query.get(VariantQueryParam.REGION.key()); Object geneRegions = query.get(VariantQueryUtils.ANNOT_GENE_REGIONS.key()); // Remove extra fields from the query - SampleIndexQuery sampleIndexQuery = new SampleIndexDBAdaptor(null, null, metadataManager).parseSampleIndexQuery(query); + SampleIndexDBAdaptor sampleIndexDBAdaptor = new SampleIndexDBAdaptor(null, null, metadataManager); + SampleIndexQuery sampleIndexQuery = sampleIndexDBAdaptor.parseSampleIndexQuery(query); setSampleIndexConfiguration(job, sampleIndexQuery.getSchema().getConfiguration(), sampleIndexQuery.getSchema().getVersion()); + sampleIndexTable = sampleIndexDBAdaptor.getSampleIndexTableName(sampleIndexQuery); // Preserve regions and gene_regions query.put(VariantQueryParam.REGION.key(), regions); query.put(VariantQueryUtils.ANNOT_GENE_REGIONS.key(), geneRegions); - LOGGER.info("Use sample index to read from HBase"); + LOGGER.info("Use sample index to read from HBase from table '{}'", sampleIndexTable); } VariantHBaseQueryParser parser = new VariantHBaseQueryParser(metadataManager); - List scans = parser.parseQueryMultiRegion(query, queryOptions); + List scans = parser.parseQueryMultiRegion(variantQuery, queryOptions); configureMapReduceScans(scans, job.getConfiguration()); - initVariantMapperJobFromHBase(job, variantTable, scans, mapperClass, useSampleIndex); + initVariantMapperJobFromHBase(job, 
variantTable, scans, mapperClass, useSampleIndex, sampleIndexTable); int i = 0; for (Scan scan : scans) { @@ -194,7 +202,7 @@ public static void initVariantMapperJob(Job job, Class } else { LOGGER.info("Init MapReduce job reading from Phoenix"); String sql = new VariantSqlQueryParser(variantTable, metadataManager, job.getConfiguration()) - .parse(query, queryOptions); + .parse(variantQuery, queryOptions); initVariantMapperJobFromPhoenix(job, variantTable, sql, mapperClass); } @@ -203,17 +211,12 @@ public static void initVariantMapperJob(Job job, Class public static void initVariantMapperJobFromHBase(Job job, String variantTableName, Scan scan, Class variantMapperClass) throws IOException { - initVariantMapperJobFromHBase(job, variantTableName, scan, variantMapperClass, false); - } - - public static void initVariantMapperJobFromHBase(Job job, String variantTableName, Scan scan, - Class variantMapperClass, boolean useSampleIndex) - throws IOException { - initVariantMapperJobFromHBase(job, variantTableName, Collections.singletonList(scan), variantMapperClass, useSampleIndex); + initVariantMapperJobFromHBase(job, variantTableName, Collections.singletonList(scan), variantMapperClass, false, null); } public static void initVariantMapperJobFromHBase(Job job, String variantTableName, List scans, - Class variantMapperClass, boolean useSampleIndex) + Class variantMapperClass, boolean useSampleIndex, + String sampleIndexTable) throws IOException { initTableMapperJob(job, variantTableName, scans, TableMapper.class); @@ -223,6 +226,7 @@ public static void initVariantMapperJobFromHBase(Job job, String variantTableNam job.setInputFormatClass(HBaseVariantTableInputFormat.class); job.getConfiguration().setBoolean(HBaseVariantTableInputFormat.MULTI_SCANS, scans.size() > 1); job.getConfiguration().setBoolean(HBaseVariantTableInputFormat.USE_SAMPLE_INDEX_TABLE_INPUT_FORMAT, useSampleIndex); + job.getConfiguration().set(HBaseVariantTableInputFormat.SAMPLE_INDEX_TABLE, sampleIndexTable); } public static void initVariantMapperJobFromPhoenix(Job job, VariantHadoopDBAdaptor dbAdaptor, @@ -279,21 +283,24 @@ public static void initVariantRowMapperJob(Job job, Class scans = parser.parseQueryMultiRegion(query, queryOptions); configureMapReduceScans(scans, job.getConfiguration()); - initVariantRowMapperJobFromHBase(job, variantTable, scans, mapperClass, useSampleIndex); + initVariantRowMapperJobFromHBase(job, variantTable, scans, mapperClass, useSampleIndex, sampleIndexTable); int i = 0; for (Scan scan : scans) { @@ -328,17 +335,12 @@ public static void setObjectMap(Job job, ObjectMap objectMap) { public static void initVariantRowMapperJobFromHBase(Job job, String variantTableName, Scan scan, Class variantMapperClass) throws IOException { - initVariantRowMapperJobFromHBase(job, variantTableName, scan, variantMapperClass, false); - } - - public static void initVariantRowMapperJobFromHBase(Job job, String variantTableName, Scan scan, - Class variantMapperClass, boolean useSampleIndex) - throws IOException { - initVariantRowMapperJobFromHBase(job, variantTableName, Collections.singletonList(scan), variantMapperClass, useSampleIndex); + initVariantRowMapperJobFromHBase(job, variantTableName, Collections.singletonList(scan), variantMapperClass, false, null); } public static void initVariantRowMapperJobFromHBase(Job job, String variantTableName, List scans, - Class variantMapperClass, boolean useSampleIndex) + Class variantMapperClass, boolean useSampleIndex, + String sampleIndexTable) throws IOException { 
initTableMapperJob(job, variantTableName, scans, TableMapper.class);
@@ -348,6 +350,7 @@ public static void initVariantRowMapperJobFromHBase(Job job, String variantTable
 job.setInputFormatClass(HBaseVariantRowTableInputFormat.class);
 job.getConfiguration().setBoolean(HBaseVariantRowTableInputFormat.MULTI_SCANS, scans.size() > 1);
 job.getConfiguration().setBoolean(HBaseVariantRowTableInputFormat.USE_SAMPLE_INDEX_TABLE_INPUT_FORMAT, useSampleIndex);
+ job.getConfiguration().set(HBaseVariantRowTableInputFormat.SAMPLE_INDEX_TABLE, sampleIndexTable);
 }
 public static void initVariantRowMapperJobFromPhoenix(Job job, VariantHadoopDBAdaptor dbAdaptor,

From 2a39303f1e39c0a980adffbd877748baf44e6b94 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jacobo=20Coll=20Morag=C3=B3n?=
Date: Tue, 12 Nov 2024 10:09:58 +0000
Subject: [PATCH 33/66] storage: Fix NoClassDefFoundError tephra. #TASK-7194
 #TASK-6722
---
 opencga-app/app/conf/opencga-env.sh | 4 +-
 .../opencga-storage-hadoop-core/pom.xml | 6 +++
 .../hadoop/utils/AbstractHBaseDriver.java | 1 +
 .../variant/AbstractVariantsTableDriver.java | 12 ++---
 .../variant/mr/VariantMapReduceUtil.java | 45 ++++++++++---------
 5 files changed, 41 insertions(+), 27 deletions(-)
diff --git a/opencga-app/app/conf/opencga-env.sh b/opencga-app/app/conf/opencga-env.sh
index d2aa76b11e2..475dddb8f39 100644
--- a/opencga-app/app/conf/opencga-env.sh
+++ b/opencga-app/app/conf/opencga-env.sh
@@ -104,6 +104,8 @@ if ( ls "${BASEDIR}"/libs/opencga-storage-hadoop-core-*.jar >/dev/null 2>&1 ) ;
 jackson=$(find "${BASEDIR}/libs/" -name "jackson-*-2.[0-9].[0-9].jar" | tr "\n" ":")
 proto=$(find "${BASEDIR}/libs/" -name "protobuf-java-*.jar" | tr "\n" ":")
 avro=$(find "${BASEDIR}/libs/" -name "avro-*.jar" | tr "\n" ":")
- export HADOOP_CLASSPATH="${phoenix}:${proto}:${avro}:${jackson}:${CLASSPATH_PREFIX}"
+ tephra=$(find "${BASEDIR}/libs/" -name "tephra-*.jar" | tr "\n" ":")
+ disruptor=$(find "${BASEDIR}/libs/" -name "disruptor-*.jar" | tr "\n" ":")
+ export HADOOP_CLASSPATH="${phoenix}:${proto}:${avro}:${jackson}:${tephra}:${disruptor}:${CLASSPATH_PREFIX}"
 export HADOOP_USER_CLASSPATH_FIRST=true
fi
\ No newline at end of file
diff --git a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/pom.xml b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/pom.xml
index ddc5d8e9792..a024b4ec3fe 100644
--- a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/pom.xml
+++ b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/pom.xml
@@ -441,6 +441,12 @@
 commons-lang:commons-lang
+
+ org.apache.tephra:tephra-core
+
+
+ com.lmax:disruptor
+
diff --git a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/utils/AbstractHBaseDriver.java b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/utils/AbstractHBaseDriver.java
index d2d7a682eb6..b3747039904 100644
--- a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/utils/AbstractHBaseDriver.java
+++ b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/utils/AbstractHBaseDriver.java
@@ -215,6 +215,7 @@ public final int run(String[] args) throws Exception {
 }
 }
 LOGGER.info("=================================================");
+ LOGGER.info("tmpjars=" + Arrays.toString(job.getConfiguration().getStrings("tmpjars")));
 reportRunningJobs();
 boolean succeed =
executeJob(job); if (!succeed) { diff --git a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/AbstractVariantsTableDriver.java b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/AbstractVariantsTableDriver.java index 35305dd4fdc..356df234d34 100644 --- a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/AbstractVariantsTableDriver.java +++ b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/AbstractVariantsTableDriver.java @@ -81,11 +81,6 @@ protected void parseAndValidateParameters() throws IOException { String archiveTable = getArchiveTable(); String variantTable = getVariantsTable(); - int maxKeyValueSize = conf.getInt(HadoopVariantStorageOptions.MR_HBASE_KEYVALUE_SIZE_MAX.key(), - HadoopVariantStorageOptions.MR_HBASE_KEYVALUE_SIZE_MAX.defaultValue()); - logger.info("HBASE: set " + ConnectionConfiguration.MAX_KEYVALUE_SIZE_KEY + " to " + maxKeyValueSize); - conf.setInt(ConnectionConfiguration.MAX_KEYVALUE_SIZE_KEY, maxKeyValueSize); // always overwrite server default (usually 1MB) - /* -------------------------------*/ // Validate parameters CHECK // if (StringUtils.isEmpty(archiveTable)) { @@ -114,11 +109,16 @@ protected void parseAndValidateParameters() throws IOException { checkTablesExist(hBaseManager, variantTable); } + int maxKeyValueSize = conf.getInt(HadoopVariantStorageOptions.MR_HBASE_KEYVALUE_SIZE_MAX.key(), + HadoopVariantStorageOptions.MR_HBASE_KEYVALUE_SIZE_MAX.defaultValue()); + logger.info("HBASE: set " + ConnectionConfiguration.MAX_KEYVALUE_SIZE_KEY + " to " + maxKeyValueSize); + conf.setInt(ConnectionConfiguration.MAX_KEYVALUE_SIZE_KEY, maxKeyValueSize); // always overwrite server default (usually 1MB) + // Increase the ScannerTimeoutPeriod to avoid ScannerTimeoutExceptions // See opencb/opencga#352 for more info. 
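+ // Resolution order: MR_HBASE_SCANNER_TIMEOUT if set, otherwise the configured HBase client scanner timeout, otherwise the HBase default.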
int scannerTimeout = getConf().getInt(HadoopVariantStorageOptions.MR_HBASE_SCANNER_TIMEOUT.key(), getConf().getInt(HConstants.HBASE_CLIENT_SCANNER_TIMEOUT_PERIOD, HConstants.DEFAULT_HBASE_CLIENT_SCANNER_TIMEOUT_PERIOD)); - logger.info("Set Scanner timeout to " + scannerTimeout + " ..."); + logger.info("HBASE: set Scanner timeout to " + scannerTimeout + " ..."); conf.setInt(HConstants.HBASE_CLIENT_SCANNER_TIMEOUT_PERIOD, scannerTimeout); } diff --git a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/mr/VariantMapReduceUtil.java b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/mr/VariantMapReduceUtil.java index afaa220977b..d6949e460d3 100644 --- a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/mr/VariantMapReduceUtil.java +++ b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/mr/VariantMapReduceUtil.java @@ -3,6 +3,7 @@ import com.fasterxml.jackson.core.JsonProcessingException; import com.fasterxml.jackson.databind.ObjectMapper; +import com.lmax.disruptor.EventFactory; import org.apache.commons.lang3.StringUtils; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.hbase.client.Mutation; @@ -21,7 +22,9 @@ import org.apache.hadoop.mapreduce.Job; import org.apache.hadoop.mapreduce.Mapper; import org.apache.hadoop.mapreduce.TaskType; +import org.apache.hadoop.mapreduce.lib.db.DBWritable; import org.apache.phoenix.mapreduce.util.PhoenixMapReduceUtil; +import org.apache.tephra.TransactionSystemClient; import org.opencb.commons.datastore.core.ObjectMap; import org.opencb.commons.datastore.core.Query; import org.opencb.commons.datastore.core.QueryOptions; @@ -47,6 +50,7 @@ import org.opencb.opencga.storage.hadoop.variant.index.sample.SampleIndexDBAdaptor; import org.opencb.opencga.storage.hadoop.variant.index.sample.SampleIndexQueryParser; import org.opencb.opencga.storage.hadoop.variant.index.sample.SampleIndexSchema; +import org.opencb.opencga.storage.hadoop.variant.utils.HBaseVariantTableNameGenerator; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -147,16 +151,6 @@ public static void initTableMapperJob(Job job, String inTable, List scans, } } - public static void initTableMapperJobFromPhoenix(Job job, String variantTable, String sql, - Class mapper) { - job.setInputFormatClass(CustomPhoenixInputFormat.class); - - LOGGER.info(sql); - PhoenixMapReduceUtil.setInput(job, ExposedResultSetDBWritable.class, variantTable, sql); - job.setMapperClass(mapper); - - } - public static void initVariantMapperJob(Job job, Class mapperClass, String variantTable, VariantStorageMetadataManager metadataManager, Query query, QueryOptions queryOptions, boolean skipSampleIndex) throws IOException { @@ -251,12 +245,8 @@ public static void initVariantMapperJobFromPhoenix(Job job, String variantTableN Class variantMapperClass) throws IOException { // VariantDBWritable is the DBWritable class that enables us to process the Result of the query - PhoenixMapReduceUtil.setInput(job, PhoenixVariantTableInputFormat.VariantDBWritable.class, variantTableName, sqlQuery); - - LOGGER.info(sqlQuery); - job.setMapperClass(variantMapperClass); - - job.setInputFormatClass(PhoenixVariantTableInputFormat.class); + initVariantMapperJobFromPhoenix(job, variantTableName, sqlQuery, variantMapperClass, + 
PhoenixVariantTableInputFormat.VariantDBWritable.class, PhoenixVariantTableInputFormat.class); } public static void initVariantRowMapperJob(Job job, Class mapperClass, String variantTable, @@ -374,13 +364,28 @@ public static void initVariantRowMapperJobFromPhoenix(Job job, VariantHadoopDBAd public static void initVariantRowMapperJobFromPhoenix(Job job, String variantTableName, String sqlQuery, Class variantMapperClass) throws IOException { + initVariantMapperJobFromPhoenix(job, variantTableName, sqlQuery, variantMapperClass, + ExposedResultSetDBWritable.class, PhoenixVariantRowTableInputFormat.class); + } - LOGGER.info(sqlQuery); - // VariantDBWritable is the DBWritable class that enables us to process the Result of the query - PhoenixMapReduceUtil.setInput(job, ExposedResultSetDBWritable.class, variantTableName, sqlQuery); + private static void initVariantMapperJobFromPhoenix(Job job, String variantTableName, String sqlQuery, + Class variantMapperClass, Class inputClass, + Class inputFormatClass) throws IOException { + boolean addDependencyJar = job.getConfiguration().getBoolean( + HadoopVariantStorageOptions.MR_ADD_DEPENDENCY_JARS.key(), + HadoopVariantStorageOptions.MR_ADD_DEPENDENCY_JARS.defaultValue()); + if (addDependencyJar) { + TableMapReduceUtil.addDependencyJars(job); + TableMapReduceUtil.addDependencyJarsForClasses(job.getConfiguration(), + TransactionSystemClient.class, + EventFactory.class); + } + + LOGGER.info(sqlQuery); + PhoenixMapReduceUtil.setInput(job, inputClass, variantTableName, sqlQuery); job.setMapperClass(variantMapperClass); - job.setInputFormatClass(PhoenixVariantRowTableInputFormat.class); + job.setInputFormatClass(inputFormatClass); } public static void setNoneReduce(Job job) throws IOException { From ae26598d606a50002b357668f075d3d130111ae8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jacobo=20Coll=20Morag=C3=B3n?= Date: Tue, 12 Nov 2024 10:13:56 +0000 Subject: [PATCH 34/66] storage: Fix NPE exporting from sampleindex. 
#TASK-6722 --- .../storage/core/variant/adaptors/VariantField.java | 9 +++++++-- .../storage/hadoop/variant/mr/VariantMapReduceUtil.java | 8 ++++++-- .../variant/utils/HBaseVariantTableNameGenerator.java | 5 +++++ 3 files changed, 18 insertions(+), 4 deletions(-) diff --git a/opencga-storage/opencga-storage-core/src/main/java/org/opencb/opencga/storage/core/variant/adaptors/VariantField.java b/opencga-storage/opencga-storage-core/src/main/java/org/opencb/opencga/storage/core/variant/adaptors/VariantField.java index 8c208b1cd30..72abce21c7c 100644 --- a/opencga-storage/opencga-storage-core/src/main/java/org/opencb/opencga/storage/core/variant/adaptors/VariantField.java +++ b/opencga-storage/opencga-storage-core/src/main/java/org/opencb/opencga/storage/core/variant/adaptors/VariantField.java @@ -16,6 +16,7 @@ package org.opencb.opencga.storage.core.variant.adaptors; +import org.apache.solr.common.StringUtils; import org.opencb.commons.datastore.core.QueryOptions; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -197,11 +198,11 @@ public static Set getIncludeFields(QueryOptions options) { } List includeList = options.getAsStringList(QueryOptions.INCLUDE); - if (includeList != null && !includeList.isEmpty()) { + if (!isEmpty(includeList)) { includeFields = parseInclude(includeList); } else { List excludeList = options.getAsStringList(QueryOptions.EXCLUDE); - if (excludeList != null && !excludeList.isEmpty()) { + if (!isEmpty(excludeList)) { includeFields = parseExclude(excludeList); } else { includeFields = new HashSet<>(Arrays.asList(values())); @@ -214,6 +215,10 @@ public static Set getIncludeFields(QueryOptions options) { return includeFields; } + private static boolean isEmpty(List stringList) { + return stringList == null || stringList.isEmpty() || (stringList.size() == 1 && StringUtils.isEmpty(stringList.get(0))); + } + public static Set parseInclude(String... 
includeList) { return parseInclude(Arrays.asList(includeList)); } diff --git a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/mr/VariantMapReduceUtil.java b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/mr/VariantMapReduceUtil.java index d6949e460d3..c55dfa6282f 100644 --- a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/mr/VariantMapReduceUtil.java +++ b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/mr/VariantMapReduceUtil.java @@ -170,7 +170,9 @@ public static void initVariantMapperJob(Job job, Class Object regions = query.get(VariantQueryParam.REGION.key()); Object geneRegions = query.get(VariantQueryUtils.ANNOT_GENE_REGIONS.key()); // Remove extra fields from the query - SampleIndexDBAdaptor sampleIndexDBAdaptor = new SampleIndexDBAdaptor(null, null, metadataManager); + HBaseVariantTableNameGenerator tableNameGenerator = HBaseVariantTableNameGenerator + .fromVariantsTable(variantTable, job.getConfiguration()); + SampleIndexDBAdaptor sampleIndexDBAdaptor = new SampleIndexDBAdaptor(null, tableNameGenerator, metadataManager); SampleIndexQuery sampleIndexQuery = sampleIndexDBAdaptor.parseSampleIndexQuery(query); setSampleIndexConfiguration(job, sampleIndexQuery.getSchema().getConfiguration(), @@ -276,7 +278,9 @@ public static void initVariantRowMapperJob(Job job, Class Date: Mon, 18 Nov 2024 15:47:54 +0000 Subject: [PATCH 35/66] storage: Ensure variant-exports are sorted even from Phoenix. #TASK-6722 --- .../opencga/core/common/ExceptionUtils.java | 25 +++++++++- .../hadoop/utils/AbstractHBaseDriver.java | 47 +++++++++++++++++-- .../hadoop/variant/executors/MRExecutor.java | 2 +- .../hadoop/variant/io/VariantDriver.java | 5 ++ .../variant/mr/StreamVariantMapper.java | 14 +++++- .../variant/mr/VariantMapReduceUtil.java | 1 + 6 files changed, 86 insertions(+), 8 deletions(-) diff --git a/opencga-core/src/main/java/org/opencb/opencga/core/common/ExceptionUtils.java b/opencga-core/src/main/java/org/opencb/opencga/core/common/ExceptionUtils.java index 89e3a6fba43..d17034d553d 100644 --- a/opencga-core/src/main/java/org/opencb/opencga/core/common/ExceptionUtils.java +++ b/opencga-core/src/main/java/org/opencb/opencga/core/common/ExceptionUtils.java @@ -58,7 +58,30 @@ private static StringBuilder prettyExceptionMessage(Throwable exception, StringB if (includeClassName) { message.append("[").append(exception.getClass().getSimpleName()).append("] "); } - message.append(exMessage); + String[] exMessageSubLines; + if (exMessage != null) { + exMessageSubLines = exMessage.split("\n"); + } else { + exMessageSubLines = new String[]{"null"}; + } + if (multiline) { + for (int i = 0; i < exMessageSubLines.length; i++) { + String exMessageSubLine = exMessageSubLines[i]; + if (i == 0) { + message.append(exMessageSubLine); + } else { + message.append(separator); + if (includeClassName) { + message.append(StringUtils.repeat(" ", exception.getClass().getSimpleName().length() + 3)); + } + message.append(exMessageSubLine); + } + } + } else { + for (String exMessageSubLine : exMessageSubLines) { + message.append(exMessageSubLine).append(" ; "); + } + } if (exception.getSuppressed().length > 0) { StringBuilder sb = new StringBuilder(); String intraSeparator = multiline ? 
separator + " " : separator; diff --git a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/utils/AbstractHBaseDriver.java b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/utils/AbstractHBaseDriver.java index b3747039904..b3bcbadbe82 100644 --- a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/utils/AbstractHBaseDriver.java +++ b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/utils/AbstractHBaseDriver.java @@ -221,10 +221,11 @@ public final int run(String[] args) throws Exception { if (!succeed) { LOGGER.error("error with job!"); if (!"NA".equals(job.getStatus().getFailureInfo())) { - LOGGER.error("Failure info: " + job.getStatus().getFailureInfo()); - printKeyValue(ERROR_MESSAGE, job.getStatus().getFailureInfo()); + String errorMessage = job.getStatus().getFailureInfo().replace("\n", "\\n"); + errorMessage += getExtendedTaskErrorMessage(job); + LOGGER.error("Failure info: " + errorMessage.replace("\\n", "\n")); + printKeyValue(ERROR_MESSAGE, errorMessage); } - } LOGGER.info("================================================="); LOGGER.info("Finish job " + getJobName()); @@ -239,6 +240,43 @@ public final int run(String[] args) throws Exception { return succeed ? 0 : 1; } + private static String getExtendedTaskErrorMessage(Job job) { + try { + StringBuilder sb = new StringBuilder(); + int eventCounter = 0; + TaskCompletionEvent[] events; + do { + events = job.getTaskCompletionEvents(eventCounter, 10); + eventCounter += events.length; + for (TaskCompletionEvent event : events) { + if (event.getStatus() == TaskCompletionEvent.Status.FAILED) { + LOGGER.info(event.toString()); + // Displaying the task diagnostic information + TaskAttemptID taskId = event.getTaskAttemptId(); + String[] taskDiagnostics = job.getTaskDiagnostics(taskId); + if (taskDiagnostics != null) { + for (String diagnostics : taskDiagnostics) { + for (String diagnosticLine : diagnostics.split("\n")) { + if (diagnosticLine.contains("Error:") + || diagnosticLine.contains("Caused by:") + || diagnosticLine.contains("Suppressed:")) { + sb.append(diagnosticLine); + sb.append("\\n"); + } + } + } + } + } + } + } while (events.length > 0); + return sb.toString(); + } catch (Exception e) { + // Ignore + LOGGER.error("Error getting task diagnostics", e); + } + return ""; + } + private void reportRunningJobs() { if (getConf().getBoolean("storage.hadoop.mr.skipReportRunningJobs", false)) { LOGGER.info("Skip report running jobs"); @@ -316,8 +354,9 @@ private boolean executeJob(Job job) throws IOException, InterruptedException, Cl } }); try { - Runtime.getRuntime().addShutdownHook(hook); job.submit(); + // Add shutdown hook after successfully submitting the job. 
+ Runtime.getRuntime().addShutdownHook(hook); JobID jobID = job.getJobID(); String applicationId = jobID.appendTo(new StringBuilder(ApplicationId.appIdStrPrefix)).toString(); printKeyValue(MR_APPLICATION_ID, applicationId); diff --git a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/executors/MRExecutor.java b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/executors/MRExecutor.java index a8d9b03745c..f8c85a813f6 100644 --- a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/executors/MRExecutor.java +++ b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/executors/MRExecutor.java @@ -121,7 +121,7 @@ public ObjectMap run(Class execClass, String[] args, String if (exitValue != 0) { String message = "Error executing MapReduce for : \"" + taskDescription + "\""; if (StringUtils.isNotEmpty(result.getErrorMessage())) { - message += " : " + result.getErrorMessage(); + message += " : " + result.getErrorMessage().replace("\\n", "\n"); } else { message += " : Unidentified error executing MapReduce job. Check logs for more information."; } diff --git a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/io/VariantDriver.java b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/io/VariantDriver.java index 4a78cb70f69..8916c5242aa 100644 --- a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/io/VariantDriver.java +++ b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/io/VariantDriver.java @@ -56,6 +56,11 @@ protected void parseAndValidateParameters() throws IOException { getQueryFromConfig(query, getConf()); getQueryOptionsFromConfig(options, getConf()); + if (!options.getBoolean(QueryOptions.SORT)) { + // Unsorted results might break the file generation. + // Results from HBase are always sorted, but when reading from Phoenix, some results might be out of order. 
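+ // Forcing SORT here should be harmless for plain HBase scans, which already produce results in row-key order.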
+ options.put(QueryOptions.SORT, true); + } logger.info(" * Query:"); for (Map.Entry entry : query.entrySet()) { diff --git a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/mr/StreamVariantMapper.java b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/mr/StreamVariantMapper.java index b490f251508..5132d49a7b0 100644 --- a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/mr/StreamVariantMapper.java +++ b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/mr/StreamVariantMapper.java @@ -365,8 +365,18 @@ private void startProcess(Context context) throws IOException, StorageEngineExce context.getCounter(COUNTER_GROUP_NAME, "start_process").increment(1); Variant variant = context.getCurrentValue(); - currentChromosome = variant.getChromosome(); - currentPosition = variant.getStart(); + if (variant.getChromosome().equals(currentChromosome)) { + if (currentPosition >= variant.getStart()) { + // Multiple variants might point to the same locus + // In that case, simply increment the position + currentPosition++; + } else { + currentPosition = variant.getStart(); + } + } else { + currentChromosome = variant.getChromosome(); + currentPosition = variant.getStart(); + } if (firstVariant == null) { firstVariant = variant.getChromosome() + ":" + variant.getStart(); } diff --git a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/mr/VariantMapReduceUtil.java b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/mr/VariantMapReduceUtil.java index c55dfa6282f..77786498ed5 100644 --- a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/mr/VariantMapReduceUtil.java +++ b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/mr/VariantMapReduceUtil.java @@ -496,6 +496,7 @@ public static QueryOptions getQueryOptionsFromConfig(Configuration conf) { public static void getQueryOptionsFromConfig(QueryOptions options, Configuration conf) { options.put(QueryOptions.INCLUDE, conf.get(QueryOptions.INCLUDE)); options.put(QueryOptions.EXCLUDE, conf.get(QueryOptions.EXCLUDE)); + options.put(QueryOptions.SORT, conf.get(QueryOptions.SORT)); } public static Query getQueryFromConfig(Configuration conf) { From 0a741d5a1c3acd2eadca5614261bad965a2837d5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jacobo=20Coll=20Morag=C3=B3n?= Date: Mon, 25 Nov 2024 13:14:11 +0000 Subject: [PATCH 36/66] storage: Use HDFS to store intermediate MapReduce files. Concat locally. 
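Write the MapReduce output to a temporary directory and concatenate the part files into the final local file once the job succeeds. The concat logic moves from AbstractHBaseDriver into the new reusable MapReduceOutputFile helper.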
#TASK-6722 --- .../main/resources/storage-configuration.yml | 2 +- .../opencga-storage-hadoop-core/pom.xml | 4 + .../storage/hadoop/app/HadoopMain.java | 139 +++++ .../opencga/storage/hadoop/app/Main.java | 3 + .../hadoop/utils/AbstractHBaseDriver.java | 338 +------------ .../hadoop/utils/DeleteHBaseColumnDriver.java | 2 +- .../hadoop/utils/MapReduceOutputFile.java | 478 ++++++++++++++++++ .../variant/AbstractVariantsTableDriver.java | 23 + .../variant/HadoopVariantStorageEngine.java | 2 +- .../VariantTableAggregationDriver.java | 40 +- .../analysis/gwas/FisherTestDriver.java | 40 +- .../hadoop/variant/executors/MRExecutor.java | 7 +- .../variant/executors/MRExecutorFactory.java | 19 +- .../variant/executors/SshMRExecutor.java | 91 +++- .../variant/io/HadoopVariantExporter.java | 5 +- .../hadoop/variant/io/VariantDriver.java | 5 +- .../variant/io/VariantExporterDriver.java | 4 +- .../variant/mr/StreamVariantDriver.java | 4 +- .../variant/mr/VariantMapReduceUtil.java | 4 +- .../variant/prune/VariantPruneDriver.java | 7 +- .../stats/CohortVariantStatsDriver.java | 3 - .../stats/SampleVariantStatsDriver.java | 2 +- .../variant/stats/VariantStatsDriver.java | 11 +- .../variant/executors/SshMRExecutorTest.java | 7 +- pom.xml | 2 +- 25 files changed, 796 insertions(+), 446 deletions(-) create mode 100644 opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/app/HadoopMain.java create mode 100644 opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/utils/MapReduceOutputFile.java diff --git a/opencga-storage/opencga-storage-core/src/main/resources/storage-configuration.yml b/opencga-storage/opencga-storage-core/src/main/resources/storage-configuration.yml index f422770d9b1..dfa6865eb40 100644 --- a/opencga-storage/opencga-storage-core/src/main/resources/storage-configuration.yml +++ b/opencga-storage/opencga-storage-core/src/main/resources/storage-configuration.yml @@ -177,7 +177,7 @@ variant: storage.hadoop.mr.executor.ssh.user: "" # Hadoop edge node user name #storage.hadoop.mr.executor.ssh.key: "~/.ssh/id_rsa" # Hadoop edge node ssh-key file storage.hadoop.mr.executor.ssh.password: "" # Hadoop edge node password. Only if ssh-key is not present. Requires sshpass to run - storage.hadoop.mr.executor.ssh.remoteOpenCgaHome: # Remote opencga home location. Only if different than local location. + storage.hadoop.mr.executor.ssh.remoteOpenCgaHome: # Remote opencga home location. Only if different from local location. storage.hadoop.mr.executor.ssh.terminationGracePeriodSeconds: 120 # Termination grace period in seconds for the ssh executor. 
# Increase the ScannerTimeoutPeriod from 60000 (1min) to 300000 (5min) to avoid ScannerTimeoutExceptions diff --git a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/pom.xml b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/pom.xml index a024b4ec3fe..1d0a1cb302b 100644 --- a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/pom.xml +++ b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/pom.xml @@ -212,6 +212,10 @@ <groupId>com.google.guava</groupId> <artifactId>guava</artifactId> </dependency> + <dependency> + <groupId>org.xerial.snappy</groupId> + <artifactId>snappy-java</artifactId> + </dependency> <dependency> <groupId>org.apache.parquet</groupId> <artifactId>parquet-avro</artifactId> diff --git a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/app/HadoopMain.java b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/app/HadoopMain.java new file mode 100644 index 00000000000..c5035c575c5 --- /dev/null +++ b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/app/HadoopMain.java @@ -0,0 +1,139 @@ +package org.opencb.opencga.storage.hadoop.app; + +import org.apache.commons.lang3.RandomStringUtils; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.*; +import org.apache.hadoop.io.compress.CompressionCodec; +import org.apache.hadoop.util.ReflectionUtils; +import org.opencb.commons.datastore.core.ObjectMap; +import org.opencb.opencga.core.common.IOUtils; +import org.opencb.opencga.storage.hadoop.utils.MapReduceOutputFile; + +import java.io.ByteArrayInputStream; +import java.io.ByteArrayOutputStream; +import java.io.InputStream; +import java.io.OutputStream; +import java.nio.charset.StandardCharsets; +import java.util.Arrays; +import java.util.Collections; +import java.util.Map; + +public class HadoopMain extends AbstractMain { + + + @Override + protected void run(String[] args) throws Exception { + new HadoopCommandExecutor().exec(args); + } + + + public static class HadoopCommandExecutor extends NestedCommandExecutor { +// private HBaseManager hBaseManager; + private Configuration conf; + + public HadoopCommandExecutor() { + this(""); + } + + public HadoopCommandExecutor(String context) { + super(context); + addSubCommand(Arrays.asList("hdfs-ls", "ls"), + " [-f <path>] [-D key=value] : List the content of an hdfs path", this::hdfsLs); + addSubCommand(Arrays.asList("hdfs-info", "info", "st"), + " [-f <path>] [-D key=value] : FS information", this::info); + addSubCommand(Collections.singletonList("codec-info"), + " [-c <codec>] [-D key=value] : Codec information", this::codecInfo); + } + + @Override + protected void setup(String command, String[] args) throws Exception { + conf = new Configuration(); + } + + @Override + protected void cleanup(String command, String[] args) throws Exception { + } + + private void hdfsLs(String[] args) throws Exception { + ObjectMap map = getArgsMap(args, "f", "D"); + String path = map.getString("f", FileSystem.getDefaultUri(conf).toString()); + addDynamic(map); + + try (FileSystem fs = FileSystem.get(new Path(path).toUri(), conf)) { + RemoteIterator<LocatedFileStatus> iterator = fs.listFiles(new Path(path), true); + while (iterator.hasNext()) { + LocatedFileStatus file = iterator.next(); + println("- " + file.getPath().toUri() + " : " + IOUtils.humanReadableByteCount(file.getLen(), false)); + } + } + } + + private void info(String[] args) throws Exception { + ObjectMap map = getArgsMap(args, "f", "D"); + String path = map.getString("f", FileSystem.getDefaultUri(conf).toString()); +
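// Apply any -D key=value overrides to the Hadoop configuration before opening the file system. +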
addDynamic(map); + + try (FileSystem fs = FileSystem.get(new Path(path).toUri(), conf)) { + info(fs); + } + } + + private void addDynamic(ObjectMap map) { + Map dynamic = map.getMap("D", Collections.emptyMap()); + if (dynamic != null) { + for (Map.Entry entry : dynamic.entrySet()) { + conf.set(entry.getKey(), entry.getValue().toString()); + } + } + } + + private void info(FileSystem fs) throws Exception { + println("fs.getScheme() = " + fs.getScheme()); + println("fs.getUri() = " + fs.getUri()); + println("fs.getHomeDirectory() = " + fs.getHomeDirectory()); + println("fs.getWorkingDirectory() = " + fs.getWorkingDirectory()); + println("fs.getConf() = " + fs.getConf()); + println("fs.getCanonicalServiceName() = " + fs.getCanonicalServiceName()); + FsStatus status = fs.getStatus(); + println("status.getCapacity() = " + IOUtils.humanReadableByteCount(status.getCapacity(), false)); + println("status.getRemaining() = " + IOUtils.humanReadableByteCount(status.getRemaining(), false)); + println("status.getUsed() = " + IOUtils.humanReadableByteCount(status.getUsed(), false)); + } + + private void codecInfo(String[] args) throws Exception { + ObjectMap map = getArgsMap(args, "c", "D"); + String codecName = map.getString("c", "deflate"); + addDynamic(map); + + CompressionCodec codec; + try { + Class aClass = Class.forName(codecName); + codec = (CompressionCodec) ReflectionUtils.newInstance(aClass, conf); + } catch (ClassNotFoundException | ClassCastException e) { + codec = MapReduceOutputFile.getCompressionCodec(codecName, conf); + } + println("Codec name : " + codecName); + if (codec == null) { + println("Codec not found!"); + } else { + println("Codec class : " + codec.getClass()); + println("Default extension : " + codec.getDefaultExtension()); + println("Compressor type : " + codec.getCompressorType()); + println("Decompressor type : " + codec.getDecompressorType()); + int rawSize = 1024 * 1024 * 10; + InputStream is = new ByteArrayInputStream(RandomStringUtils.randomAlphanumeric(rawSize).getBytes(StandardCharsets.UTF_8)); + ByteArrayOutputStream byteOs = new ByteArrayOutputStream(rawSize); + OutputStream os = codec.createOutputStream(byteOs); + org.apache.commons.io.IOUtils.copy(is, os); + int compressedSize = byteOs.size(); + + println("Compression rate : " + + IOUtils.humanReadableByteCount(rawSize, false) + "(" + rawSize + "B) " + + "-> " + + IOUtils.humanReadableByteCount(compressedSize, false) + "(" + compressedSize + "B) " + + String.format("%.3f", ((double) compressedSize) / ((double) rawSize))); + os.close(); + } + } + } +} diff --git a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/app/Main.java b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/app/Main.java index 017590c3f8f..d5d829c6cfa 100644 --- a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/app/Main.java +++ b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/app/Main.java @@ -28,6 +28,9 @@ public static void main(String[] mainArgs) throws Exception { executor.addSubCommand(Arrays.asList("convertintovirtual", "ConvertIntoVirtual"), "Migrate into virtual file", args -> { new ConvertIntoVirtual().run(args); }); + executor.addSubCommand(Arrays.asList("hadoop", "hdfs"), "Run hadoop commands", args -> { + new HadoopMain().run(args); + }); executor.exec(mainArgs); } diff 
--git a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/utils/AbstractHBaseDriver.java b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/utils/AbstractHBaseDriver.java index b3bcbadbe82..787b10648dc 100644 --- a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/utils/AbstractHBaseDriver.java +++ b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/utils/AbstractHBaseDriver.java @@ -1,27 +1,22 @@ package org.opencb.opencga.storage.hadoop.utils; -import org.apache.commons.io.input.ReaderInputStream; import org.apache.commons.lang3.ArrayUtils; import org.apache.commons.lang3.StringUtils; import org.apache.commons.lang3.time.StopWatch; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.conf.Configured; import org.apache.hadoop.fs.FileSystem; -import org.apache.hadoop.fs.LocatedFileStatus; import org.apache.hadoop.fs.Path; -import org.apache.hadoop.fs.RemoteIterator; import org.apache.hadoop.hbase.HBaseConfiguration; import org.apache.hadoop.hbase.mapreduce.TableInputFormat; import org.apache.hadoop.hbase.mapreduce.TableOutputFormat; -import org.apache.hadoop.io.IOUtils; -import org.apache.hadoop.io.compress.*; +import org.apache.hadoop.io.compress.CompressionCodec; +import org.apache.hadoop.io.compress.GzipCodec; import org.apache.hadoop.mapred.JobConf; import org.apache.hadoop.mapreduce.*; import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; -import org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter; import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat; -import org.apache.hadoop.util.ReflectionUtils; import org.apache.hadoop.util.Tool; import org.apache.hadoop.util.ToolRunner; import org.apache.hadoop.yarn.api.records.ApplicationId; @@ -29,29 +24,20 @@ import org.apache.hadoop.yarn.api.records.YarnApplicationState; import org.apache.hadoop.yarn.client.api.YarnClient; import org.apache.hadoop.yarn.exceptions.YarnException; -import org.apache.parquet.hadoop.ParquetFileWriter; import org.apache.phoenix.mapreduce.util.PhoenixConfigurationUtil; import org.opencb.commons.datastore.core.ObjectMap; import org.opencb.opencga.core.common.ExceptionUtils; import org.opencb.opencga.core.common.TimeUtils; import org.opencb.opencga.storage.core.exceptions.StorageEngineException; -import org.opencb.opencga.storage.hadoop.io.HDFSIOConnector; -import org.opencb.opencga.storage.hadoop.variant.executors.SshMRExecutor; import org.opencb.opencga.storage.hadoop.variant.mr.AbstractHBaseVariantTableInputFormat; import org.opencb.opencga.storage.hadoop.variant.mr.VariantMapReduceUtil; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import java.io.*; -import java.net.URI; -import java.nio.charset.Charset; -import java.nio.file.Files; -import java.nio.file.Paths; +import java.io.IOException; import java.util.*; -import java.util.function.Supplier; import java.util.stream.Collectors; -import static org.opencb.opencga.core.common.IOUtils.humanReadableByteCount; import static org.opencb.opencga.storage.hadoop.variant.HadoopVariantStorageOptions.MR_EXECUTOR_SSH_PASSWORD; /** @@ -64,6 +50,7 @@ public abstract class AbstractHBaseDriver extends Configured implements Tool { public static final String COLUMNS_TO_COUNT = "columns_to_count"; public static 
final String MR_APPLICATION_ID = "MR_APPLICATION_ID"; public static final String ERROR_MESSAGE = "ERROR_MESSAGE"; + public static final String OUTPUT_PARAM = "output"; private static final Logger LOGGER = LoggerFactory.getLogger(AbstractHBaseDriver.class); protected String table; @@ -382,91 +369,6 @@ protected static void printKeyValue(String key, Object value) { System.err.println(key + "=" + value); } - protected boolean isLocal(Path path) { - return HDFSIOConnector.isLocal(path.toUri(), getConf()); - } - - protected Path getTempOutdir(String prefix) throws IOException { - return getTempOutdir(prefix, ""); - } - - protected Path getTempOutdir(String prefix, String suffix) throws IOException { - return getTempOutdir(prefix, suffix, false); - } - - protected Path getTempOutdir(String prefix, String suffix, boolean ensureHdfs) throws IOException { - if (StringUtils.isEmpty(suffix)) { - suffix = ""; - } else if (!suffix.startsWith(".")) { - suffix = "." + suffix; - } - // Be aware that - // > ABFS does not allow files or directories to end with a dot. - String fileName = prefix + "." + TimeUtils.getTime() + suffix; - - Path tmpDir = new Path(getConf().get("hadoop.tmp.dir")); - if (ensureHdfs) { - FileSystem fileSystem = tmpDir.getFileSystem(getConf()); - if (!fileSystem.getScheme().equals("hdfs")) { - LOGGER.info("Temporary directory is not in hdfs:// . Hdfs is required for this temporary file."); - LOGGER.info(" Default file system : " + fileSystem.getUri()); - for (String nameServiceId : getConf().getTrimmedStringCollection("dfs.nameservices")) { - try { - Path hdfsTmpPath = new Path("hdfs", nameServiceId, "/tmp/"); - FileSystem hdfsFileSystem = hdfsTmpPath.getFileSystem(getConf()); - if (hdfsFileSystem != null) { - LOGGER.info("Change to file system : " + hdfsFileSystem.getUri()); - tmpDir = hdfsTmpPath; - break; - } - } catch (Exception e) { - LOGGER.debug("This file system is not hdfs:// . 
Skip!", e); - } - } - } - } - return new Path(tmpDir, fileName); - } - - private URI toUri(Path path) throws IOException { - URI tmpUri = path.toUri(); - if (tmpUri.getScheme() == null) { - // If the scheme is null, add the default scheme - FileSystem fileSystem = path.getFileSystem(getConf()); - tmpUri = fileSystem.getUri().resolve(tmpUri.getPath()); - } - return tmpUri; - } - - protected Path getLocalOutput(Path outdir) throws IOException { - return getLocalOutput(outdir, () -> null); - } - - protected Path getLocalOutput(Path outdir, Supplier nameGenerator) throws IOException { - if (!isLocal(outdir)) { - throw new IllegalArgumentException("Outdir " + outdir + " is not in the local filesystem"); - } - Path localOutput = outdir; - FileSystem localFs = localOutput.getFileSystem(getConf()); - if (localFs.exists(localOutput)) { - if (localFs.isDirectory(localOutput)) { - String name = nameGenerator.get(); - if (StringUtils.isEmpty(name)) { - throw new IllegalArgumentException("Local output '" + localOutput + "' is a directory"); - } - localOutput = new Path(localOutput, name); - } else { - throw new IllegalArgumentException("File '" + localOutput + "' already exists!"); - } - } else { - if (!localFs.exists(localOutput.getParent())) { - Files.createDirectories(Paths.get(localOutput.getParent().toUri())); -// throw new IOException("No such file or directory: " + localOutput); - } - } - return localOutput; - } - protected void deleteTemporaryFile(Path outdir) throws IOException { LOGGER.info("Delete temporary file " + outdir.toUri()); FileSystem fileSystem = outdir.getFileSystem(getConf()); @@ -475,238 +377,6 @@ protected void deleteTemporaryFile(Path outdir) throws IOException { LOGGER.info("Temporary file deleted!"); } - public class MapReduceOutputFile { - public static final String OUTPUT_PARAM = "output"; - - private final Supplier nameGenerator; - private final String tempFilePrefix; - private final Map extraFiles = new HashMap<>(); - private String namedOutput; - protected Path localOutput; - protected Path outdir; - - public MapReduceOutputFile(String tempFilePrefix) throws IOException { - this.nameGenerator = () -> null; - this.tempFilePrefix = tempFilePrefix; - initOutputPath(); - namedOutput = null; - } - - public MapReduceOutputFile(Supplier nameGenerator, String tempFilePrefix) throws IOException { - this.nameGenerator = nameGenerator; - this.tempFilePrefix = tempFilePrefix; - initOutputPath(); - namedOutput = null; - } - - private void initOutputPath() throws IOException { - String outdirStr = getParam(OUTPUT_PARAM); - if (StringUtils.isNotEmpty(outdirStr)) { - outdir = new Path(outdirStr); - - if (isLocal(outdir)) { - localOutput = AbstractHBaseDriver.this.getLocalOutput(outdir, nameGenerator); - outdir = getTempOutdir(tempFilePrefix, localOutput.getName()); - outdir.getFileSystem(getConf()).deleteOnExit(outdir); - } - if (hasTempOutput()) { - LOGGER.info(" * Output file : " + toUri(localOutput)); - LOGGER.info(" * Temporary outdir : " + toUri(outdir)); - } else { - LOGGER.info(" * Outdir: " + toUri(outdir)); - } - } - } - - public void postExecute(boolean succeed) throws IOException { - if (succeed) { - if (hasTempOutput()) { - getConcatMrOutputToLocal(); - } - } - if (hasTempOutput()) { - deleteTemporaryFile(outdir); - } - } - - public boolean hasTempOutput() { - return localOutput != null; - } - - public MapReduceOutputFile setNamedOutput(String partFilePrefix) { - this.namedOutput = partFilePrefix; - return this; - } - - public void addExtraNamedOutput(String namedOutput, 
String localOutputPrefix) { - extraFiles.put(namedOutput, localOutputPrefix); - } - - protected void getConcatMrOutputToLocal() throws IOException { - concatMrOutputToLocal(outdir, localOutput, true, namedOutput); - - for (Map.Entry entry : extraFiles.entrySet()) { - String suffix = entry.getValue(); - String partFilePrefix = entry.getKey(); - Path extraOutput = localOutput.suffix(suffix); - concatMrOutputToLocal(outdir, extraOutput, true, partFilePrefix); - printKeyValue(SshMRExecutor.EXTRA_OUTPUT_PREFIX + partFilePrefix.toUpperCase(), extraOutput); - } - } - - public Path getLocalOutput() { - return localOutput; - } - - public Path getOutdir() { - return outdir; - } - } - - /** - * Concatenate all generated files from a MapReduce job into one single local file. - * - * @param mrOutdir MapReduce output directory - * @param localOutput Local file - * @throws IOException on IOException - * @return List of copied files from HDFS - */ - protected List concatMrOutputToLocal(Path mrOutdir, Path localOutput) throws IOException { - return concatMrOutputToLocal(mrOutdir, localOutput, true, null); - } - - /** - * Concatenate all generated files from a MapReduce job into one single local file. - * - * @param mrOutdir MapReduce output directory - * @param localOutput Local file - * @param removeExtraHeaders Remove header lines starting with "#" from all files but the first - * @param partFilePrefix Filter partial files with specific prefix. Otherwise, concat them all. - * @throws IOException on IOException - * @return List of copied files from HDFS - */ - protected List concatMrOutputToLocal(Path mrOutdir, Path localOutput, boolean removeExtraHeaders, String partFilePrefix) - throws IOException { - // TODO: Allow copy output to any IOConnector - FileSystem fileSystem = mrOutdir.getFileSystem(getConf()); - RemoteIterator it = fileSystem.listFiles(mrOutdir, false); - List paths = new ArrayList<>(); - while (it.hasNext()) { - LocatedFileStatus status = it.next(); - Path path = status.getPath(); - if (status.isFile() - && !path.getName().equals(FileOutputCommitter.SUCCEEDED_FILE_NAME) - && !path.getName().equals(FileOutputCommitter.PENDING_DIR_NAME) - && !path.getName().equals(ParquetFileWriter.PARQUET_METADATA_FILE) - && !path.getName().equals(ParquetFileWriter.PARQUET_COMMON_METADATA_FILE) - && status.getLen() > 0) { - if (partFilePrefix == null || path.getName().startsWith(partFilePrefix)) { - paths.add(path); - } - } - } - StopWatch stopWatch = new StopWatch(); - stopWatch.start(); - if (paths.isEmpty()) { - LOGGER.warn("The MapReduce job didn't produce any output. 
This may not be expected."); - } else if (paths.size() == 1) { - LOGGER.info("Copy to local file"); - LOGGER.info(" Source : {} ({})", - paths.get(0).toUri(), humanReadableByteCount(fileSystem.getFileStatus(paths.get(0)).getLen(), false)); - LOGGER.info(" Target : {}", localOutput.toUri()); - fileSystem.copyToLocalFile(false, paths.get(0), localOutput); - } else { - LOGGER.info("Concat and copy to local : " + paths.size() + " partial files"); - LOGGER.info(" Source {}: {}", getCompression(paths.get(0).getName()), mrOutdir.toUri()); - LOGGER.info(" Target {}: {}", getCompression(localOutput.getName()), localOutput.toUri()); - LOGGER.info(" ---- "); - - try (OutputStream os = getOutputStreamPlain(localOutput.getName(), localOutput.getFileSystem(getConf()).create(localOutput))) { - for (int i = 0; i < paths.size(); i++) { - Path path = paths.get(i); - LOGGER.info("[{}] Concat {} file : '{}' ({}) ", - i, - getCompression(path.getName()), - path.toUri(), - humanReadableByteCount(fileSystem.getFileStatus(path).getLen(), false)); - try (InputStream isAux = getInputStream(path.getName(), fileSystem.open(path))) { - InputStream is = isAux; - // Remove extra headers from all files but the first - if (removeExtraHeaders && i != 0) { - BufferedReader br = new BufferedReader(new InputStreamReader(is)); - String line; - do { - br.mark(10 * 1024 * 1024); //10MB - line = br.readLine(); - // Skip blank lines and - } while (line != null && (StringUtils.isBlank(line) || line.startsWith("#"))); - br.reset(); - is = new ReaderInputStream(br, Charset.defaultCharset()); - } - - IOUtils.copyBytes(is, os, getConf(), false); - } - } - } - LOGGER.info("File size : " + humanReadableByteCount(Files.size(Paths.get(localOutput.toUri())), false)); - LOGGER.info("Time to copy from HDFS and concat : " + TimeUtils.durationToString(stopWatch)); - } - return paths; - } - - private static String getCompression(String name) throws IOException { - if (name.endsWith(".gz")) { - return "gzip"; - } else if (name.endsWith(".snappy")) { - return "snappy"; - } else if (name.endsWith(".lz4")) { - return "lz4"; - } else if (name.endsWith(".zst")) { - return "ztandard"; - } else { - return "plain"; - } - } - - private OutputStream getOutputStreamPlain(String name, OutputStream fsOs) throws IOException { - CompressionCodec codec = getCompressionCodec(name); - if (codec == null) { - return fsOs; - } - return codec.createOutputStream(fsOs); - } - - private CompressionCodec getCompressionCodec(String name) throws IOException { - Class codecClass; - switch (getCompression(name)) { - case "gzip": - codecClass = GzipCodec.class; - break; - case "snappy": - codecClass = SnappyCodec.class; - break; - case "lz4": - codecClass = Lz4Codec.class; - break; - case "ztandard": - codecClass = ZStandardCodec.class; - break; - case "plain": - return null; - default: - throw new IOException("Unknown compression codec for file " + name); - } - return ReflectionUtils.newInstance(codecClass, getConf()); - } - - private InputStream getInputStream(String name, InputStream is) throws IOException { - CompressionCodec codec = getCompressionCodec(name); - if (codec == null) { - return is; - } - return codec.createInputStream(is); - } - protected final int getServersSize(String table) throws IOException { int serversSize; try (HBaseManager hBaseManager = new HBaseManager(getConf())) { diff --git a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/utils/DeleteHBaseColumnDriver.java 
b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/utils/DeleteHBaseColumnDriver.java index 53473a977b2..f9b3b576460 100644 --- a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/utils/DeleteHBaseColumnDriver.java +++ b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/utils/DeleteHBaseColumnDriver.java @@ -109,7 +109,7 @@ public void setupJob(Job job, String table) throws IOException { VariantMapReduceUtil.setNoneReduce(job); } else { VariantMapReduceUtil.initTableMapperJob(job, table, scans, DeleteHBaseColumnToProtoMapper.class); - outdir = getTempOutdir("opencga_delete", table, true); + outdir = MapReduceOutputFile.getTempOutdir("opencga_delete", table, true, getConf()); outdir.getFileSystem(getConf()).deleteOnExit(outdir); LOGGER.info(" * Temporary outdir file: " + outdir.toUri()); diff --git a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/utils/MapReduceOutputFile.java b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/utils/MapReduceOutputFile.java new file mode 100644 index 00000000000..e91be76f97e --- /dev/null +++ b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/utils/MapReduceOutputFile.java @@ -0,0 +1,478 @@ +package org.opencb.opencga.storage.hadoop.utils; + +import org.apache.commons.io.input.ReaderInputStream; +import org.apache.commons.lang3.StringUtils; +import org.apache.commons.lang3.time.StopWatch; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.LocatedFileStatus; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.fs.RemoteIterator; +import org.apache.hadoop.io.IOUtils; +import org.apache.hadoop.io.compress.*; +import org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter; +import org.apache.hadoop.util.ReflectionUtils; +import org.apache.parquet.hadoop.ParquetFileWriter; +import org.opencb.commons.datastore.core.ObjectMap; +import org.opencb.opencga.core.common.TimeUtils; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import org.xerial.snappy.SnappyInputStream; +import org.xerial.snappy.SnappyOutputStream; + +import java.io.*; +import java.net.URI; +import java.nio.charset.Charset; +import java.nio.file.Files; +import java.nio.file.Paths; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.function.Supplier; + +import static org.opencb.opencga.core.common.IOUtils.humanReadableByteCount; + +public class MapReduceOutputFile { + + public static final String EXTRA_OUTPUT_PREFIX = "EXTRA_OUTPUT_"; + public static final String NAMED_OUTPUT = "NAMED_OUTPUT"; + public static final String EXTRA_NAMED_OUTPUT_PREFIX = "EXTRA_NAMED_OUTPUT_"; + private static final Logger LOGGER = LoggerFactory.getLogger(MapReduceOutputFile.class); + + private final Configuration conf; + private final Supplier nameGenerator; + private final Map extraFiles = new HashMap<>(); + private String namedOutput; + protected Path localOutput; + protected Path outdir; + + public MapReduceOutputFile(String outdirStr, String tempFilePrefix, Configuration conf) throws IOException { + this(outdirStr, null, tempFilePrefix, conf); + } + + public 
MapReduceOutputFile(String outdirStr, Supplier nameGenerator, String tempFilePrefix, + Configuration conf) throws IOException { + this(outdirStr, nameGenerator, tempFilePrefix, false, conf); + } + + public MapReduceOutputFile(String outdirStr, Supplier nameGenerator, String tempFilePrefix, boolean ensureHdfs, + Configuration conf) throws IOException { + this.conf = conf; + this.nameGenerator = nameGenerator == null ? () -> null : nameGenerator; + namedOutput = null; + + outdir = new Path(outdirStr); + + if (isLocal(outdir)) { + localOutput = getLocalOutput(outdir); + outdir = getTempOutdir(tempFilePrefix, localOutput.getName(), ensureHdfs, conf); + outdir.getFileSystem(conf).deleteOnExit(outdir); + } + if (hasTempOutput()) { + LOGGER.info(" * Output file : " + toUri(localOutput)); + LOGGER.info(" * MapReduce outdir : " + toUri(outdir)); + } else { + LOGGER.info(" * MapReduce outdir : " + toUri(outdir)); + } + } + + public static Path getTempOutdir(String prefix, String suffix, boolean ensureHdfs, Configuration conf) throws IOException { + if (StringUtils.isEmpty(suffix)) { + suffix = ""; + } else if (!suffix.startsWith(".")) { + suffix = "." + suffix; + } + // Be aware that + // > ABFS does not allow files or directories to end with a dot. + String fileName = prefix + "." + TimeUtils.getTime() + suffix; + + Path tmpDir = new Path(conf.get("hadoop.tmp.dir")); + if (ensureHdfs) { + if (!isHdfs(tmpDir, conf)) { + LOGGER.info("Temporary directory is not in hdfs:// . Hdfs is required for this temporary file."); + LOGGER.info(" Default file system : " + FileSystem.getDefaultUri(conf)); + for (String nameServiceId : conf.getTrimmedStringCollection("dfs.nameservices")) { + try { + Path hdfsTmpPath = new Path("hdfs", nameServiceId, "/tmp/"); + FileSystem hdfsFileSystem = hdfsTmpPath.getFileSystem(conf); + if (hdfsFileSystem != null) { + LOGGER.info("Change to file system : " + hdfsFileSystem.getUri()); + tmpDir = hdfsTmpPath; + break; + } + } catch (Exception e) { + LOGGER.debug("This file system is not hdfs:// . Skip!", e); + } + } + } + } + return new Path(tmpDir, fileName); + } + + /** + * Check if a given Hadoop path is local. + * If the scheme is null, it will check the default hadoop file system. + * @param path Hadoop path + * @return true if the path is local + */ + protected boolean isLocal(Path path) { + URI uri = path.toUri(); + String scheme = uri.getScheme(); + if (StringUtils.isEmpty(scheme)) { + scheme = FileSystem.getDefaultUri(conf).getScheme(); + } + return "file".equals(scheme); + } + + /** + * Check if a given URI is local. + * If the scheme is null, it assumes it is local. 
+ * @param uri URI + * @return true if the URI is local + */ + public static boolean isLocal(URI uri) { + String scheme = uri.getScheme(); + if (StringUtils.isEmpty(scheme)) { + scheme = "file"; + } + return StringUtils.isEmpty(scheme) || "file".equals(scheme); + } + + public static boolean isHdfs(Path dir, Configuration conf) { + try { + String scheme = dir.toUri().getScheme(); + if (StringUtils.isEmpty(scheme)) { + scheme = FileSystem.getDefaultUri(conf).getScheme(); + return scheme.equals("hdfs"); + } + FileSystem fileSystem = dir.getFileSystem(conf); + return fileSystem.getScheme().equals("hdfs"); + } catch (IOException e) { + LOGGER.error("Error checking if " + dir + " is HDFS : " + e.getMessage()); + return false; + } + } + + public void postExecute(ObjectMap result, boolean succeed) throws IOException { + readKeyValues(result); + postExecute(succeed); + } + + public void postExecute(boolean succeed) throws IOException { + printKeyValue(); + if (succeed) { + if (hasTempOutput()) { + getConcatMrOutputToLocal(); + } + } + if (hasTempOutput()) { + deleteTemporaryFile(outdir); + } + } + + private void readKeyValues(ObjectMap result) { + for (String key : result.keySet()) { + if (key.equals(MapReduceOutputFile.NAMED_OUTPUT)) { + setNamedOutput(result.getString(key)); + } else if (key.startsWith(MapReduceOutputFile.EXTRA_NAMED_OUTPUT_PREFIX)) { + addExtraNamedOutput(key.substring(MapReduceOutputFile.EXTRA_NAMED_OUTPUT_PREFIX.length()), result.getString(key)); + } + } + } + + private void printKeyValue() { + // Print keyValues only if this method is being called from an instance of AbstractHBaseDriver + // Check the stacktrace + boolean found = false; + StackTraceElement[] stackTrace = Thread.currentThread().getStackTrace(); + for (StackTraceElement stackTraceElement : stackTrace) { + try { + Class aClass = Class.forName(stackTraceElement.getClassName()); + if (AbstractHBaseDriver.class.isAssignableFrom(aClass)) { + found = true; + break; + } + } catch (ClassNotFoundException e) { + // This should never happen + throw new RuntimeException(e); + } + } + if (!found) { + return; + } + + if (namedOutput != null) { + AbstractHBaseDriver.printKeyValue(NAMED_OUTPUT, namedOutput); + } + for (Map.Entry entry : extraFiles.entrySet()) { + String suffix = entry.getValue(); + String partFilePrefix = entry.getKey(); + if (hasTempOutput()) { + Path extraOutput = localOutput.suffix(suffix); + AbstractHBaseDriver.printKeyValue(EXTRA_OUTPUT_PREFIX + partFilePrefix, extraOutput); + } else { + AbstractHBaseDriver.printKeyValue(EXTRA_NAMED_OUTPUT_PREFIX + partFilePrefix, suffix); + } + } + } + + public boolean hasTempOutput() { + return localOutput != null; + } + + public MapReduceOutputFile setNamedOutput(String partFilePrefix) { + this.namedOutput = partFilePrefix; + return this; + } + + public void addExtraNamedOutput(String namedOutput, String localOutputPrefix) { + extraFiles.put(namedOutput, localOutputPrefix); + } + + protected void getConcatMrOutputToLocal() throws IOException { + concatMrOutputToLocal(outdir, localOutput, true, namedOutput); + + for (Map.Entry entry : extraFiles.entrySet()) { + String partFilePrefix = entry.getKey(); + String suffix = entry.getValue(); + Path extraOutput = localOutput.suffix(suffix); + concatMrOutputToLocal(outdir, extraOutput, true, partFilePrefix); + AbstractHBaseDriver.printKeyValue(EXTRA_OUTPUT_PREFIX + partFilePrefix.toUpperCase(), extraOutput); + } + } + + /** + * Get the local output file. Might be null if the destination is HDFS. 
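+ * The file is only written during {@link #postExecute(boolean)}, once the MapReduce part files have been concatenated.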
+ * @return Local output file + */ + public Path getLocalOutput() { + return localOutput; + } + + /** + * Get the actual output directory for the MapReduce job. + * @return Output directory + */ + public Path getOutdir() { + return outdir; + } + + public Configuration getConf() { + return conf; + } + + private URI toUri(Path path) throws IOException { + URI tmpUri = path.toUri(); + if (tmpUri.getScheme() == null) { + // If the scheme is null, add the default scheme + FileSystem fileSystem = path.getFileSystem(conf); + tmpUri = fileSystem.getUri().resolve(tmpUri.getPath()); + } + return tmpUri; + } + + protected Path getLocalOutput(Path outdir) throws IOException { + if (!isLocal(outdir)) { + throw new IllegalArgumentException("Outdir " + outdir + " is not in the local filesystem"); + } + Path localOutput = outdir; + FileSystem localFs = localOutput.getFileSystem(conf); + if (localFs.exists(localOutput)) { + if (localFs.isDirectory(localOutput)) { + String name = nameGenerator.get(); + if (StringUtils.isEmpty(name)) { + throw new IllegalArgumentException("Local output '" + localOutput + "' is a directory"); + } + localOutput = new Path(localOutput, name); + } else { + throw new IllegalArgumentException("File '" + localOutput + "' already exists!"); + } + } else { + if (!localFs.exists(localOutput.getParent())) { + Files.createDirectories(Paths.get(localOutput.getParent().toUri())); +// throw new IOException("No such file or directory: " + localOutput); + } + } + return localOutput; + } + + protected void deleteTemporaryFile(Path outdir) throws IOException { + LOGGER.info("Delete temporary file " + outdir.toUri()); + FileSystem fileSystem = outdir.getFileSystem(conf); + fileSystem.delete(outdir, true); + fileSystem.cancelDeleteOnExit(outdir); + LOGGER.info("Temporary file deleted!"); + } + + /** + * Concatenate all generated files from a MapReduce job into one single local file. + * + * @param mrOutdir MapReduce output directory + * @param localOutput Local file + * @return List of copied files from HDFS + * @throws IOException on IOException + */ + protected List concatMrOutputToLocal(Path mrOutdir, Path localOutput) throws IOException { + return concatMrOutputToLocal(mrOutdir, localOutput, true, null); + } + + /** + * Concatenate all generated files from a MapReduce job into one single local file. + * + * @param mrOutdir MapReduce output directory + * @param localOutput Local file + * @param removeExtraHeaders Remove header lines starting with "#" from all files but the first + * @param partFilePrefix Filter partial files with specific prefix. Otherwise, concat them all. 
+ * @return List of copied files from HDFS + * @throws IOException on IOException + */ + protected List<Path> concatMrOutputToLocal(Path mrOutdir, Path localOutput, boolean removeExtraHeaders, String partFilePrefix) + throws IOException { + // TODO: Allow copy output to any IOConnector + FileSystem fileSystem = mrOutdir.getFileSystem(getConf()); + RemoteIterator<LocatedFileStatus> it = fileSystem.listFiles(mrOutdir, false); + List<Path> paths = new ArrayList<>(); + while (it.hasNext()) { + LocatedFileStatus status = it.next(); + Path path = status.getPath(); + if (status.isFile() + && !path.getName().equals(FileOutputCommitter.SUCCEEDED_FILE_NAME) + && !path.getName().equals(FileOutputCommitter.PENDING_DIR_NAME) + && !path.getName().equals(ParquetFileWriter.PARQUET_METADATA_FILE) + && !path.getName().equals(ParquetFileWriter.PARQUET_COMMON_METADATA_FILE) + && status.getLen() > 0) { + if (partFilePrefix == null || path.getName().startsWith(partFilePrefix)) { + paths.add(path); + } + } + } + StopWatch stopWatch = new StopWatch(); + stopWatch.start(); + if (paths.isEmpty()) { + LOGGER.warn("The MapReduce job didn't produce any output. This may not be expected."); + } else if (paths.size() == 1) { + LOGGER.info("Copy to local file"); + LOGGER.info(" Source : {} ({})", + paths.get(0).toUri(), humanReadableByteCount(fileSystem.getFileStatus(paths.get(0)).getLen(), false)); + LOGGER.info(" Target : {}", localOutput.toUri()); + fileSystem.copyToLocalFile(false, paths.get(0), localOutput); + } else { + LOGGER.info("Concat and copy to local : " + paths.size() + " partial files"); + LOGGER.info(" Source {}: {}", getCompression(paths.get(0).getName()), mrOutdir.toUri()); + LOGGER.info(" Target {}: {}", getCompression(localOutput.getName()), localOutput.toUri()); + LOGGER.info(" ---- "); + + try (OutputStream os = getOutputStreamPlain(localOutput.getName(), localOutput.getFileSystem(getConf()).create(localOutput))) { + for (int i = 0; i < paths.size(); i++) { + Path path = paths.get(i); + LOGGER.info("[{}] Concat {} file : '{}' ({}) ", + i, + getCompression(path.getName()), + path.toUri(), + humanReadableByteCount(fileSystem.getFileStatus(path).getLen(), false)); + try (InputStream isAux = getInputStream(path.getName(), fileSystem.open(path))) { + InputStream is = isAux; + // Remove extra headers from all files but the first + if (removeExtraHeaders && i != 0) { + BufferedReader br = new BufferedReader(new InputStreamReader(is)); + String line; + do { + br.mark(10 * 1024 * 1024); //10MB + line = br.readLine(); + // Skip blank lines and header lines starting with "#" + } while (line != null && (StringUtils.isBlank(line) || line.startsWith("#"))); + br.reset(); + is = new ReaderInputStream(br, Charset.defaultCharset()); + } + + IOUtils.copyBytes(is, os, getConf(), false); + } + } + } + LOGGER.info("File size : " + humanReadableByteCount(Files.size(Paths.get(localOutput.toUri())), false)); + LOGGER.info("Time to copy from HDFS and concat : " + TimeUtils.durationToString(stopWatch)); + } + return paths; + } + + private static String getCompression(String name) throws IOException { + if (name.endsWith(".gz")) { + return "gzip"; + } else if (name.endsWith(".snappy")) { + return "snappy"; + } else if (name.endsWith(".lz4")) { + return "lz4"; + } else if (name.endsWith(".zst")) { + return "zstandard"; + } else { + return "plain"; + } + } + + private OutputStream getOutputStreamPlain(String name, OutputStream os) throws IOException { + CompressionCodec codec = getCompressionCodec(name); + if (codec == null) { + return os; + } + try { + return codec.createOutputStream(os);
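+ // Native hadoop codecs such as SnappyCodec throw UnsatisfiedLinkError when the native library is not loaded; + // fall back to the bundled snappy-java streams in that case.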
+ } catch (UnsatisfiedLinkError error) { + if (codec instanceof SnappyCodec) { + return new SnappyOutputStream(os); + } else { + throw error; + } + } + } + + private CompressionCodec getCompressionCodec(String name) throws IOException { + return getCompressionCodec(getCompression(name), getConf()); + } + + public static CompressionCodec getCompressionCodec(String codecName, Configuration conf) throws IOException { + Class<? extends CompressionCodec> codecClass; + switch (codecName) { + case "deflate": + codecClass = DeflateCodec.class; + break; + case "gz": + case "gzip": + codecClass = GzipCodec.class; + break; + case "snappy": + codecClass = SnappyCodec.class; + break; + case "lz4": + codecClass = Lz4Codec.class; + break; + case "zstandard": + codecClass = ZStandardCodec.class; + break; + case "bz": + codecClass = BZip2Codec.class; + break; + case "plain": + return null; + default: + throw new IOException("Unknown compression codec " + codecName); + } + return ReflectionUtils.newInstance(codecClass, conf); + } + + private InputStream getInputStream(String name, InputStream is) throws IOException { + CompressionCodec codec = getCompressionCodec(name); + if (codec == null) { + return is; + } + try { + return codec.createInputStream(is); + } catch (UnsatisfiedLinkError error) { + if (codec instanceof SnappyCodec) { + return new SnappyInputStream(is); + } else { + throw error; + } + } + } +} diff --git a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/AbstractVariantsTableDriver.java b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/AbstractVariantsTableDriver.java index 356df234d34..b72c4d9d45f 100644 --- a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/AbstractVariantsTableDriver.java +++ b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/AbstractVariantsTableDriver.java @@ -33,6 +33,7 @@ import org.opencb.opencga.storage.core.variant.VariantStorageOptions; import org.opencb.opencga.storage.hadoop.utils.AbstractHBaseDriver; import org.opencb.opencga.storage.hadoop.utils.HBaseManager; +import org.opencb.opencga.storage.hadoop.utils.MapReduceOutputFile; import org.opencb.opencga.storage.hadoop.variant.archive.ArchiveTableHelper; import org.opencb.opencga.storage.hadoop.variant.gaps.FillMissingFromArchiveTask; import org.opencb.opencga.storage.hadoop.variant.metadata.HBaseVariantStorageMetadataDBAdaptorFactory; @@ -44,6 +45,7 @@ import java.io.IOException; import java.io.UncheckedIOException; import java.util.*; +import java.util.function.Supplier; import java.util.stream.Collectors; import static org.opencb.opencga.storage.hadoop.variant.HadoopVariantStorageEngine.FILE_ID; @@ -267,6 +269,27 @@ protected String getArchiveTable() { return getConf().get(ArchiveTableHelper.CONFIG_ARCHIVE_TABLE_NAME, StringUtils.EMPTY); } + protected MapReduceOutputFile initMapReduceOutputFile() throws IOException { + return initMapReduceOutputFile(null); + } + + protected MapReduceOutputFile initMapReduceOutputFile(Supplier<String> nameGenerator) throws IOException { + return initMapReduceOutputFile(nameGenerator, false); + } + + protected MapReduceOutputFile initMapReduceOutputFile(Supplier<String> nameGenerator, boolean optional) throws IOException { + String output = getParam(OUTPUT_PARAM); + if (StringUtils.isEmpty(output)) { + if (optional) { + return null;
+ } else { + throw new IllegalArgumentException("Expected param " + OUTPUT_PARAM); + } + } + return new MapReduceOutputFile(output, nameGenerator, + getTableNameGenerator().getDbName() + "_" + getClass().getSimpleName(), getConf()); + } + protected HBaseVariantTableNameGenerator getTableNameGenerator() { String dbName = HBaseVariantTableNameGenerator.getDBNameFromVariantsTableName(getVariantsTable()); return new HBaseVariantTableNameGenerator(dbName, getConf()); diff --git a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/HadoopVariantStorageEngine.java b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/HadoopVariantStorageEngine.java index 023dbbaeec0..3affa2e3b95 100644 --- a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/HadoopVariantStorageEngine.java +++ b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/HadoopVariantStorageEngine.java @@ -1309,7 +1309,7 @@ private Configuration getHadoopConfiguration(ObjectMap options) { public MRExecutor getMRExecutor() throws StorageEngineException { if (mrExecutor == null) { - mrExecutor = MRExecutorFactory.getMRExecutor(getOptions()); + mrExecutor = MRExecutorFactory.getMRExecutor(getDBName(), getOptions(), getConf()); } return mrExecutor; } diff --git a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/VariantTableAggregationDriver.java b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/VariantTableAggregationDriver.java index 0d471c8387d..32da542bfe2 100644 --- a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/VariantTableAggregationDriver.java +++ b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/VariantTableAggregationDriver.java @@ -1,9 +1,8 @@ package org.opencb.opencga.storage.hadoop.variant; -import org.apache.commons.lang3.StringUtils; -import org.apache.hadoop.fs.Path; import org.apache.hadoop.mapreduce.Job; import org.apache.hadoop.mapreduce.Reducer; +import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; import org.apache.hadoop.mapreduce.lib.output.NullOutputFormat; import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat; import org.opencb.commons.datastore.core.Query; @@ -11,6 +10,7 @@ import org.opencb.opencga.storage.core.exceptions.StorageEngineException; import org.opencb.opencga.storage.core.metadata.VariantStorageMetadataManager; import org.opencb.opencga.storage.core.variant.VariantStorageOptions; +import org.opencb.opencga.storage.hadoop.utils.MapReduceOutputFile; import org.opencb.opencga.storage.hadoop.variant.mr.VariantMapReduceUtil; import org.opencb.opencga.storage.hadoop.variant.mr.VariantRowMapper; import org.slf4j.Logger; @@ -28,10 +28,8 @@ public abstract class VariantTableAggregationDriver extends AbstractVariantsTableDriver { private static final Logger LOGGER = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass()); - public static final String OUTPUT = "output"; - - protected Path outdir; - protected Path localOutput; + public static final String OUTPUT = OUTPUT_PARAM; + protected MapReduceOutputFile 
output; @Override @@ -51,16 +49,7 @@ protected void parseAndValidateParameters() throws IOException { throw new IllegalArgumentException("Missing study"); } - String outdirStr = getParam(OUTPUT); - if (StringUtils.isNotEmpty(outdirStr)) { - outdir = new Path(outdirStr); - - if (isLocal(outdir)) { - localOutput = getLocalOutput(outdir, this::generateOutputFileName); - outdir = getTempOutdir("opencga_sample_variant_stats", localOutput.getName()); - outdir.getFileSystem(getConf()).deleteOnExit(outdir); - } - } + output = initMapReduceOutputFile(null, true); } @@ -118,17 +107,11 @@ protected Job setupJob(Job job, String archiveTable, String variantTable) throws job.setOutputKeyClass(getOutputKeyClass()); job.setOutputValueClass(getOutputValueClass()); - if (outdir == null) { + if (output == null) { job.setOutputFormatClass(NullOutputFormat.class); } else { job.setOutputFormatClass(TextOutputFormat.class); - TextOutputFormat.setOutputPath(job, outdir); - if (localOutput == null) { - LOGGER.info("Output directory : " + outdir); - } else { - LOGGER.info("Temporary output directory : " + outdir); - LOGGER.info("Local output file : " + localOutput); - } + FileOutputFormat.setOutputPath(job, output.getOutdir()); // set Path } int numReduceTasks = getNumReduceTasks(); @@ -142,13 +125,8 @@ protected Job setupJob(Job job, String archiveTable, String variantTable) throws @Override protected void postExecution(boolean succeed) throws IOException, StorageEngineException { super.postExecution(succeed); - if (succeed) { - if (localOutput != null) { - concatMrOutputToLocal(outdir, localOutput, isOutputWithHeaders(), null); - } - } - if (localOutput != null) { - deleteTemporaryFile(outdir); + if (output != null) { + output.postExecute(succeed); } } diff --git a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/analysis/gwas/FisherTestDriver.java b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/analysis/gwas/FisherTestDriver.java index 0e11259da3a..14398d925a7 100644 --- a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/analysis/gwas/FisherTestDriver.java +++ b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/analysis/gwas/FisherTestDriver.java @@ -3,7 +3,6 @@ import org.apache.commons.lang3.StringUtils; import org.apache.commons.lang3.tuple.Pair; import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.fs.Path; import org.apache.hadoop.io.NullWritable; import org.apache.hadoop.io.Text; import org.apache.hadoop.io.compress.GzipCodec; @@ -16,7 +15,6 @@ import org.opencb.biodata.models.variant.stats.VariantStats; import org.opencb.commons.datastore.core.Query; import org.opencb.commons.datastore.core.QueryOptions; -import org.opencb.opencga.core.common.TimeUtils; import org.opencb.opencga.storage.core.exceptions.StorageEngineException; import org.opencb.opencga.storage.core.metadata.VariantStorageMetadataManager; import org.opencb.opencga.storage.core.metadata.models.ProjectMetadata; @@ -25,6 +23,7 @@ import org.opencb.opencga.storage.core.variant.adaptors.VariantField; import org.opencb.opencga.storage.core.variant.adaptors.VariantQueryException; import org.opencb.opencga.storage.core.variant.adaptors.VariantQueryParam; +import org.opencb.opencga.storage.hadoop.utils.MapReduceOutputFile; import 
org.opencb.opencga.storage.hadoop.variant.AbstractVariantsTableDriver; import org.opencb.opencga.storage.hadoop.variant.converters.VariantRow; import org.opencb.opencga.storage.hadoop.variant.converters.annotation.HBaseToVariantAnnotationConverter; @@ -50,7 +49,7 @@ public class FisherTestDriver extends AbstractVariantsTableDriver { private final Logger logger = LoggerFactory.getLogger(FisherTestDriver.class); // Output directory within DFS - public static final String OUTPUT = "output"; + public static final String OUTPUT = OUTPUT_PARAM; // // Move to local directory (remove from DFS) // public static final String MOVE_TO_LOCAL = "move-to-local"; public static final String CASE_COHORT = "caseCohort"; @@ -64,8 +63,7 @@ public class FisherTestDriver extends AbstractVariantsTableDriver { private Integer controlCohortId; private List caseCohort; private List controlCohort; - private Path outdir; - private Path localOutput; + private MapReduceOutputFile output; private Query query; private QueryOptions queryOptions; @@ -138,25 +136,10 @@ protected void parseAndValidateParameters() throws IOException { VariantField.STUDIES_SECONDARY_ALTERNATES, VariantField.STUDIES_STATS)); - String outdirStr = getConf().get(OUTPUT); - if (StringUtils.isEmpty(outdirStr)) { - outdir = new Path("fisher." + TimeUtils.getTime() + ".tsv"); - } else { - outdir = new Path(outdirStr); - if (isLocal(outdir)) { - localOutput = getLocalOutput(outdir, () -> "fisher_test." + TimeUtils.getTime() + ".tsv.gz"); - outdir = getTempOutdir("opencga_fisher_test", "." + localOutput.getName()); - outdir.getFileSystem(getConf()).deleteOnExit(outdir); - } - if (localOutput != null) { - logger.info(" * Outdir file: " + localOutput.toUri()); - logger.info(" * Temporary outdir file: " + outdir.toUri()); - } else { - logger.info(" * Outdir file: " + outdir.toUri()); - } - } + output = initMapReduceOutputFile(); } + private Pair> parseCohort(String cohortStr, String cohortDescription) throws IOException { VariantStorageMetadataManager metadataManager = getMetadataManager(); int studyId = getStudyId(); @@ -202,11 +185,11 @@ protected Job setupJob(Job job, String archiveTable, String variantTable) throws job.getConfiguration().set(CONTROL_COHORT_IDS, controlCohort.stream().map(Objects::toString).collect(Collectors.joining(","))); job.setOutputFormatClass(TextOutputFormat.class); - if (outdir.toString().toLowerCase().endsWith(".gz")) { + if (output.getOutdir().toString().toLowerCase().endsWith(".gz")) { TextOutputFormat.setCompressOutput(job, true); TextOutputFormat.setOutputCompressorClass(job, GzipCodec.class); } - TextOutputFormat.setOutputPath(job, outdir); + TextOutputFormat.setOutputPath(job, output.getOutdir()); job.setReducerClass(FisherTestReducer.class); job.setMapOutputKeyClass(NullWritable.class); @@ -227,14 +210,7 @@ protected String getJobOperationName() { @Override protected void postExecution(boolean succeed) throws IOException, StorageEngineException { super.postExecution(succeed); - if (succeed) { - if (localOutput != null) { - concatMrOutputToLocal(outdir, localOutput); - } - } - if (localOutput != null) { - deleteTemporaryFile(outdir); - } + output.postExecute(succeed); } public static class FisherTestMapper extends VariantRowMapper { diff --git a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/executors/MRExecutor.java 
b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/executors/MRExecutor.java index f8c85a813f6..fd0c805d1dd 100644 --- a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/executors/MRExecutor.java +++ b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/executors/MRExecutor.java @@ -18,6 +18,7 @@ import org.apache.commons.lang3.StringUtils; import org.apache.commons.lang3.time.StopWatch; +import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.util.Tool; import org.opencb.commons.datastore.core.ObjectMap; import org.opencb.opencga.core.common.GitRepositoryState; @@ -46,6 +47,8 @@ public abstract class MRExecutor { public static final String HADOOP_LIB_VERSION_PROPERTIES = "org/opencb/opencga/storage/hadoop/lib/version.properties"; + protected String dbName; + protected Configuration conf; private ObjectMap options; private List env; private static Logger logger = LoggerFactory.getLogger(MRExecutor.class); @@ -74,8 +77,10 @@ public String getErrorMessage() { } } - public MRExecutor init(ObjectMap options) { + public MRExecutor init(String dbName, Configuration conf, ObjectMap options) { + this.dbName = dbName; this.options = options; + this.conf = conf; env = options.getAsStringList(MR_HADOOP_ENV.key()); return this; } diff --git a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/executors/MRExecutorFactory.java b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/executors/MRExecutorFactory.java index 29666831779..a51e4d8cde8 100644 --- a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/executors/MRExecutorFactory.java +++ b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/executors/MRExecutorFactory.java @@ -1,5 +1,6 @@ package org.opencb.opencga.storage.hadoop.variant.executors; +import org.apache.hadoop.conf.Configuration; import org.opencb.commons.datastore.core.ObjectMap; import org.opencb.opencga.storage.core.exceptions.StorageEngineException; @@ -15,33 +16,29 @@ public final class MRExecutorFactory { private MRExecutorFactory() { } - public static MRExecutor getMRExecutor(ObjectMap options) throws StorageEngineException { + public static MRExecutor getMRExecutor(String dbName, ObjectMap options, Configuration conf) throws StorageEngineException { MRExecutor mrExecutor; - Class aClass; String executor = options.getString(MR_EXECUTOR.key(), MR_EXECUTOR.defaultValue()); switch (executor.toLowerCase()) { case "system": - aClass = SystemMRExecutor.class; + mrExecutor = new SystemMRExecutor(); break; case "ssh": - aClass = SshMRExecutor.class; + mrExecutor = new SshMRExecutor(); break; default: try { + Class aClass; aClass = Class.forName(executor).asSubclass(MRExecutor.class); - } catch (ClassNotFoundException | ClassCastException e) { + mrExecutor = aClass.newInstance(); + } catch (InstantiationException | IllegalAccessException | ClassNotFoundException | ClassCastException e) { throw new StorageEngineException("Error creating MRExecutor '" + executor + "'", e); } break; } - try { - mrExecutor = aClass.newInstance(); - } catch (InstantiationException | 
IllegalAccessException e) { - throw new StorageEngineException("Error creating MRExecutor '" + executor + "'", e); - } // configure MRExecutor - mrExecutor.init(options); + mrExecutor.init(dbName, conf, options); return mrExecutor; } diff --git a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/executors/SshMRExecutor.java b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/executors/SshMRExecutor.java index faea9185887..e410e21265d 100644 --- a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/executors/SshMRExecutor.java +++ b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/executors/SshMRExecutor.java @@ -1,6 +1,7 @@ package org.opencb.opencga.storage.hadoop.variant.executors; import org.apache.commons.lang3.StringUtils; +import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.util.RunJar; import org.apache.tools.ant.types.Commandline; import org.opencb.commons.datastore.core.ObjectMap; @@ -8,10 +9,13 @@ import org.opencb.opencga.core.common.UriUtils; import org.opencb.opencga.storage.core.exceptions.StorageEngineException; import org.opencb.opencga.storage.hadoop.utils.AbstractHBaseDriver; +import org.opencb.opencga.storage.hadoop.utils.MapReduceOutputFile; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import java.io.ByteArrayOutputStream; +import java.io.IOException; +import java.net.URI; import java.nio.charset.Charset; import java.nio.file.Path; import java.nio.file.Paths; @@ -35,17 +39,17 @@ public class SshMRExecutor extends MRExecutor { // env-var expected by "sshpass -e" private static final String SSHPASS_ENV = "SSHPASS"; public static final String PID = "PID"; - public static final String EXTRA_OUTPUT_PREFIX = "EXTRA_OUTPUT_"; private static Logger logger = LoggerFactory.getLogger(SshMRExecutor.class); @Override - public SshMRExecutor init(ObjectMap options) { - super.init(options); + public SshMRExecutor init(String dbName, Configuration conf, ObjectMap options) { + super.init(dbName, conf, options); return this; } @Override public Result run(String executable, String[] args) throws StorageEngineException { + MapReduceOutputFile mrOutput = initMrOutput(executable, args); String commandLine = buildCommand(executable, args); List env = buildEnv(); @@ -105,17 +109,79 @@ public Result run(String executable, String[] args) throws StorageEngineExceptio int exitValue = command.getExitValue(); Runtime.getRuntime().removeShutdownHook(hook); ObjectMap result = readResult(new String(outputStream.toByteArray(), Charset.defaultCharset())); - if (exitValue == 0) { - copyOutputFiles(args, env); - for (String key : result.keySet()) { - if (key.startsWith(EXTRA_OUTPUT_PREFIX)) { - copyOutputFiles(result.getString(key), env); + boolean succeed = exitValue == 0; + if (mrOutput != null) { + try { + mrOutput.postExecute(result, succeed); + } catch (IOException e) { + throw new StorageEngineException(e.getMessage(), e); + } + } + try { + if (succeed) { + if (mrOutput != null) { + mrOutput.postExecute(result, succeed); + } else { + copyOutputFiles(args, env); } + // Copy extra output files + for (String key : result.keySet()) { + if (key.startsWith(MapReduceOutputFile.EXTRA_OUTPUT_PREFIX)) { + copyOutputFiles(result.getString(key), env); + } + } + } else { + if (mrOutput != null) { + 
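// postExecute(succeed=false) is expected to only clean up the temporary MapReduce output; nothing is copied back to the local filesystem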
mrOutput.postExecute(result, succeed); + } // else // should delete remote output files? } + } catch (IOException e) { + throw new StorageEngineException(e.getMessage(), e); } return new Result(exitValue, result); } + /** + * If the MapReduce to be executed is writing to a local filesystem, change the output to a temporary HDFS path. + * The output will be copied to the local filesystem after the execution. + *

+ * This method will look for the {@link AbstractHBaseDriver#OUTPUT_PARAM} argument in the args array. + * + * @param executable Executable + * @param args Arguments passed to the executable. Might be modified + * @return MapReduceOutputFile if any + * @throws StorageEngineException if there is an issue creating the temporary output path + */ + private MapReduceOutputFile initMrOutput(String executable, String[] args) throws StorageEngineException { + MapReduceOutputFile mrOutput = null; + List<String> argsList = Arrays.asList(args); + int outputIdx = argsList.indexOf(AbstractHBaseDriver.OUTPUT_PARAM); + if (outputIdx > 0 && argsList.size() > outputIdx + 1) { + String output = argsList.get(outputIdx + 1); + URI outputUri = UriUtils.createUriSafe(output); + if (MapReduceOutputFile.isLocal(outputUri)) { + try { + int i = executable.lastIndexOf('.'); + String tempFilePrefix; + if (i > 0) { + String className = executable.substring(i); + tempFilePrefix = dbName + "_" + className; + } else { + tempFilePrefix = dbName; + } + mrOutput = new MapReduceOutputFile(outputUri.toString(), null, + tempFilePrefix, true, conf); + } catch (IOException e) { + throw new StorageEngineException(e.getMessage(), e); + } + logger.info("Change output from file:// to hdfs://. Using MapReduceOutputFile: " + mrOutput.getOutdir()); + // Replace output path with the temporary path + argsList.set(outputIdx + 1, mrOutput.getOutdir().toString()); + } + } + return mrOutput; + } + /** * Copy output files from remote server to local filesystem. *

@@ -129,7 +195,7 @@ public Result run(String executable, String[] args) throws StorageEngineExceptio */ private Path copyOutputFiles(String[] args, List env) throws StorageEngineException { List argsList = Arrays.asList(args); - int outputIdx = argsList.indexOf("output"); + int outputIdx = argsList.indexOf(AbstractHBaseDriver.OUTPUT_PARAM); if (outputIdx > 0 && argsList.size() > outputIdx + 1) { return copyOutputFiles(argsList.get(outputIdx + 1), env); } @@ -138,7 +204,12 @@ private Path copyOutputFiles(String[] args, List env) throws StorageEngi } private Path copyOutputFiles(String output, List env) throws StorageEngineException { - String targetOutput = UriUtils.createUriSafe(output).getPath(); + URI targetOutputUri = UriUtils.createUriSafe(output); + if (MapReduceOutputFile.isLocal(targetOutputUri)) { + logger.info("Output is not a file:// URI. Skipping copy file {}", targetOutputUri); + return null; + } + String targetOutput = targetOutputUri.getPath(); if (StringUtils.isNotEmpty(targetOutput)) { String remoteOpencgaHome = getOptions().getString(MR_EXECUTOR_SSH_REMOTE_OPENCGA_HOME.key()); String srcOutput; diff --git a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/io/HadoopVariantExporter.java b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/io/HadoopVariantExporter.java index 53511b84739..f0dd9ed7875 100644 --- a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/io/HadoopVariantExporter.java +++ b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/io/HadoopVariantExporter.java @@ -75,7 +75,7 @@ public HadoopVariantExporter(HadoopVariantStorageEngine engine, VariantMetadataF public List export(@Nullable URI outputFileUri, VariantWriterFactory.VariantOutputFormat outputFormat, URI variantsFile, ParsedVariantQuery variantQuery) throws IOException, StorageEngineException { - VariantHadoopDBAdaptor dbAdaptor = ((VariantHadoopDBAdaptor) engine.getDBAdaptor()); + VariantHadoopDBAdaptor dbAdaptor = engine.getDBAdaptor(); IOConnector ioConnector = ioConnectorProvider.get(outputFileUri); // Use pre-processed query instead of input query @@ -199,7 +199,8 @@ public List export(@Nullable URI outputFileUri, VariantWriterFactory.Varian || (variantsFile != null) || smallQuery || queryOptions.getBoolean("skipMapReduce", false) - || (!(ioConnector instanceof HDFSIOConnector) && !(ioConnector instanceof LocalIOConnector))) { + // Mapreduce can only use HDFS or Local IOConnectors. 
When using other IOConnectors, skip mapreduce + || !(ioConnector instanceof HDFSIOConnector || ioConnector instanceof LocalIOConnector)) { return super.export(outputFileUri, outputFormat, variantsFile, variantQuery); } else { outputFileUri = VariantWriterFactory.checkOutput(outputFileUri, outputFormat); diff --git a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/io/VariantDriver.java b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/io/VariantDriver.java index 8916c5242aa..2658435b809 100644 --- a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/io/VariantDriver.java +++ b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/io/VariantDriver.java @@ -11,6 +11,7 @@ import org.opencb.commons.datastore.core.QueryOptions; import org.opencb.opencga.storage.core.exceptions.StorageEngineException; import org.opencb.opencga.storage.core.variant.adaptors.VariantQueryParam; +import org.opencb.opencga.storage.hadoop.utils.MapReduceOutputFile; import org.opencb.opencga.storage.hadoop.variant.AbstractVariantsTableDriver; import org.opencb.opencga.storage.hadoop.variant.mr.VariantMapReduceUtil; import org.opencb.opencga.storage.hadoop.variant.mr.VariantMapper; @@ -38,8 +39,6 @@ */ public abstract class VariantDriver extends AbstractVariantsTableDriver { - public static final String OUTPUT_PARAM = "output"; - public static final String CONCAT_OUTPUT_PARAM = "concat-output"; protected MapReduceOutputFile output; private final Query query = new Query(); private final QueryOptions options = new QueryOptions(); @@ -52,7 +51,7 @@ protected void parseAndValidateParameters() throws IOException { super.parseAndValidateParameters(); // useReduceStep = Boolean.valueOf(getParam(CONCAT_OUTPUT_PARAM)); - output = new MapReduceOutputFile(getTableNameGenerator().getDbName() + "_" + getClass().getSimpleName()); + output = initMapReduceOutputFile(); getQueryFromConfig(query, getConf()); getQueryOptionsFromConfig(options, getConf()); diff --git a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/io/VariantExporterDriver.java b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/io/VariantExporterDriver.java index 93a75006fb4..eea7f69d5b4 100644 --- a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/io/VariantExporterDriver.java +++ b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/io/VariantExporterDriver.java @@ -142,7 +142,9 @@ protected void setupJob(Job job) throws IOException { } if (SnappyCodec.isNativeCodeLoaded()) { FileOutputFormat.setCompressOutput(job, true); - FileOutputFormat.setOutputCompressorClass(job, SnappyCodec.class); + // FIXME: SnappyCodec might not be available in client side +// FileOutputFormat.setOutputCompressorClass(job, SnappyCodec.class); + FileOutputFormat.setOutputCompressorClass(job, GzipCodec.class); } else { FileOutputFormat.setCompressOutput(job, true); FileOutputFormat.setOutputCompressorClass(job, GzipCodec.class); diff --git 
a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/mr/StreamVariantDriver.java b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/mr/StreamVariantDriver.java index 91ac57391dc..960196f6f1d 100644 --- a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/mr/StreamVariantDriver.java +++ b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/mr/StreamVariantDriver.java @@ -166,7 +166,9 @@ protected void setupJob(Job job) throws IOException { job.setOutputFormatClass(ValueOnlyTextOutputFormat.class); if (SnappyCodec.isNativeCodeLoaded()) { FileOutputFormat.setCompressOutput(job, true); - FileOutputFormat.setOutputCompressorClass(job, SnappyCodec.class); + // FIXME: SnappyCodec might not be available in client side +// FileOutputFormat.setOutputCompressorClass(job, SnappyCodec.class); + FileOutputFormat.setOutputCompressorClass(job, GzipCodec.class); } else { FileOutputFormat.setCompressOutput(job, true); FileOutputFormat.setOutputCompressorClass(job, GzipCodec.class); diff --git a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/mr/VariantMapReduceUtil.java b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/mr/VariantMapReduceUtil.java index 77786498ed5..46b059e05c7 100644 --- a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/mr/VariantMapReduceUtil.java +++ b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/mr/VariantMapReduceUtil.java @@ -222,7 +222,9 @@ public static void initVariantMapperJobFromHBase(Job job, String variantTableNam job.setInputFormatClass(HBaseVariantTableInputFormat.class); job.getConfiguration().setBoolean(HBaseVariantTableInputFormat.MULTI_SCANS, scans.size() > 1); job.getConfiguration().setBoolean(HBaseVariantTableInputFormat.USE_SAMPLE_INDEX_TABLE_INPUT_FORMAT, useSampleIndex); - job.getConfiguration().set(HBaseVariantTableInputFormat.SAMPLE_INDEX_TABLE, sampleIndexTable); + if (sampleIndexTable != null) { + job.getConfiguration().set(HBaseVariantTableInputFormat.SAMPLE_INDEX_TABLE, sampleIndexTable); + } } public static void initVariantMapperJobFromPhoenix(Job job, VariantHadoopDBAdaptor dbAdaptor, diff --git a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/prune/VariantPruneDriver.java b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/prune/VariantPruneDriver.java index fddaa7c189e..239331c0aca 100644 --- a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/prune/VariantPruneDriver.java +++ b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/prune/VariantPruneDriver.java @@ -21,6 +21,7 @@ import org.opencb.opencga.storage.core.exceptions.StorageEngineException; import org.opencb.opencga.storage.core.metadata.VariantStorageMetadataManager; import 
org.opencb.opencga.storage.core.metadata.models.VariantScoreMetadata; +import org.opencb.opencga.storage.hadoop.utils.MapReduceOutputFile; import org.opencb.opencga.storage.hadoop.variant.AbstractVariantsTableDriver; import org.opencb.opencga.storage.hadoop.variant.adaptors.phoenix.PhoenixHelper; import org.opencb.opencga.storage.hadoop.variant.adaptors.phoenix.VariantPhoenixSchema; @@ -60,7 +61,7 @@ protected Class getMapperClass() { @Override protected String getJobOperationName() { - return "vairants-prune"; + return "variants-prune"; } @Override @@ -82,9 +83,7 @@ protected void parseAndValidateParameters() throws IOException { params.updateParams(new HashMap<>(Collections.singletonMap(key, value))); } } - output = new MapReduceOutputFile( - () -> "variant_prune_report." + TimeUtils.getTime() + ".txt", - "variant_prune_report"); + output = initMapReduceOutputFile(() -> "variant_prune_report." + TimeUtils.getTime() + ".txt"); } @Override diff --git a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/stats/CohortVariantStatsDriver.java b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/stats/CohortVariantStatsDriver.java index e9dfa8ee28e..b633508455f 100644 --- a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/stats/CohortVariantStatsDriver.java +++ b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/stats/CohortVariantStatsDriver.java @@ -63,9 +63,6 @@ protected void parseAndValidateParameters() throws IOException { String samples = getParam(SAMPLES); String cohort = getParam(COHORT); - if (outdir == null) { - throw new IllegalArgumentException("Expected param " + OUTPUT); - } if (samples == null && cohort == null) { throw new IllegalArgumentException("Expected param " + SAMPLES + " or " + COHORT); } diff --git a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/stats/SampleVariantStatsDriver.java b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/stats/SampleVariantStatsDriver.java index 33f4e07675c..7ff4049afce 100644 --- a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/stats/SampleVariantStatsDriver.java +++ b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/stats/SampleVariantStatsDriver.java @@ -252,7 +252,7 @@ protected Job setupJob(Job job, String archiveTable, String variantTable) throws } job.getConfiguration().setInt(STUDY_ID, getStudyId()); job.getConfiguration().set(TRIOS, trios); - if (outdir != null) { + if (output != null) { job.getConfiguration().setBoolean(WRITE_TO_DISK, true); } return job; diff --git a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/stats/VariantStatsDriver.java b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/stats/VariantStatsDriver.java index c1dc34e2e21..20d17205311 100644 --- 
a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/stats/VariantStatsDriver.java +++ b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/stats/VariantStatsDriver.java @@ -19,6 +19,7 @@ import org.opencb.opencga.storage.core.variant.adaptors.VariantQueryParam; import org.opencb.opencga.storage.core.variant.query.VariantQueryUtils; import org.opencb.opencga.storage.core.variant.stats.VariantStatisticsManager; +import org.opencb.opencga.storage.hadoop.utils.MapReduceOutputFile; import org.opencb.opencga.storage.hadoop.variant.AbstractVariantsTableDriver; import org.opencb.opencga.storage.hadoop.variant.GenomeHelper; import org.opencb.opencga.storage.hadoop.variant.adaptors.VariantHBaseQueryParser; @@ -101,9 +102,9 @@ protected void parseAndValidateParameters() throws IOException { logger.info(" * " + VariantStorageOptions.STATS_DEFAULT_GENOTYPE.key() + ": " + statsDefaultGenotype); - output = new MapReduceOutputFile(() -> "variant_stats." + output = initMapReduceOutputFile(() -> "variant_stats." + (cohorts.size() < 10 ? "." + String.join("_", cohortNames) : "") - + TimeUtils.getTime() + ".json", "opencga_sample_variant_stats"); + + TimeUtils.getTime() + ".json", true); } @Override @@ -129,7 +130,7 @@ protected Job setupJob(Job job, String archiveTableName, String variantTableName query.put(VariantQueryParam.INCLUDE_FILE.key(), VariantQueryUtils.NONE); } - if (output.getOutdir() != null) { + if (output != null) { // Do not index stats. // Allow any input query. // Write stats to file. @@ -212,7 +213,9 @@ protected Job setupJob(Job job, String archiveTableName, String variantTableName @Override protected void postExecution(boolean succeed) throws IOException, StorageEngineException { super.postExecution(succeed); - output.postExecute(succeed); + if (output != null) { + output.postExecute(succeed); + } } @Override diff --git a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/test/java/org/opencb/opencga/storage/hadoop/variant/executors/SshMRExecutorTest.java b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/test/java/org/opencb/opencga/storage/hadoop/variant/executors/SshMRExecutorTest.java index 99f109eff15..5d3018deee9 100644 --- a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/test/java/org/opencb/opencga/storage/hadoop/variant/executors/SshMRExecutorTest.java +++ b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/test/java/org/opencb/opencga/storage/hadoop/variant/executors/SshMRExecutorTest.java @@ -1,5 +1,6 @@ package org.opencb.opencga.storage.hadoop.variant.executors; +import org.apache.hadoop.conf.Configuration; import org.junit.Before; import org.junit.Test; import org.junit.experimental.categories.Category; @@ -36,14 +37,14 @@ public void setUp() throws Exception { @Test public void testFactory() throws StorageEngineException { - MRExecutor mrExecutor = MRExecutorFactory.getMRExecutor(options); + MRExecutor mrExecutor = MRExecutorFactory.getMRExecutor("", options, new Configuration()); assertThat(mrExecutor, instanceOf(SshMRExecutor.class)); } @Test public void testRun() throws StorageEngineException { SshMRExecutor sshMRExecutor = new SshMRExecutor(); - sshMRExecutor.init(options); + sshMRExecutor.init("", new Configuration(), options); String cmd = sshMRExecutor.buildCommand("echo", "hello world", 
HadoopVariantStorageOptions.MR_EXECUTOR_SSH_PASSWORD.key(), "password"); assertEquals("/opt/opencga/misc/scripts/hadoop-ssh.sh echo \"hello world\" " + HadoopVariantStorageOptions.MR_EXECUTOR_SSH_PASSWORD.key() + " _redacted_", cmd); @@ -57,7 +58,7 @@ public void testRun() throws StorageEngineException { @Test public void testChangeRemoteOpenCGAHome() throws StorageEngineException { SshMRExecutor sshMRExecutor = new SshMRExecutor(); - sshMRExecutor.init(options.append(HadoopVariantStorageOptions.MR_EXECUTOR_SSH_REMOTE_OPENCGA_HOME.key(), "/home/user/opencga")); + sshMRExecutor.init("", new Configuration(), options.append(HadoopVariantStorageOptions.MR_EXECUTOR_SSH_REMOTE_OPENCGA_HOME.key(), "/home/user/opencga")); String hadoopClasspath = "/opt/opencga/libs/myLib.jar::/opt/opencga/libs/myLibOther.jar:/opt/opencga/conf/hadoop"; String expectedHadoopClasspath = "/home/user/opencga/libs/myLib.jar:/home/user/opencga/libs/myLibOther.jar:/home/user/opencga/conf/hadoop"; diff --git a/pom.xml b/pom.xml index e073746344d..840bbb15ec5 100644 --- a/pom.xml +++ b/pom.xml @@ -113,7 +113,7 @@ 2.2.0 2.1.0 1.0.0 - 1.1.8.2 + 1.1.10.4 ${parquet-common.version} 5.0 From cd50a3c483deace1e2f45bdec88ae1028862c4b8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jacobo=20Coll=20Morag=C3=B3n?= Date: Mon, 25 Nov 2024 13:53:39 +0000 Subject: [PATCH 37/66] storage: Improve MapReduceOutputFile concatMrOutputToLocal. #TASK-6722 --- .../opencb/opencga/core/common/IOUtils.java | 115 ++++++++++++++++++ .../opencga/core/common/IOUtilsTest.java | 19 ++- .../hadoop/utils/MapReduceOutputFile.java | 43 +++++-- 3 files changed, 163 insertions(+), 14 deletions(-) diff --git a/opencga-core/src/main/java/org/opencb/opencga/core/common/IOUtils.java b/opencga-core/src/main/java/org/opencb/opencga/core/common/IOUtils.java index eb0cdeaf29e..bfc9f3524ba 100644 --- a/opencga-core/src/main/java/org/opencb/opencga/core/common/IOUtils.java +++ b/opencga-core/src/main/java/org/opencb/opencga/core/common/IOUtils.java @@ -16,12 +16,21 @@ package org.opencb.opencga.core.common; +import org.opencb.commons.run.ParallelTaskRunner; + import java.io.*; +import java.nio.ByteBuffer; import java.nio.charset.Charset; import java.nio.file.*; import java.nio.file.attribute.BasicFileAttributes; import java.util.ArrayList; +import java.util.Collections; +import java.util.LinkedList; import java.util.List; +import java.util.concurrent.ArrayBlockingQueue; +import java.util.concurrent.BlockingQueue; +import java.util.concurrent.ExecutionException; +import java.util.concurrent.atomic.AtomicReference; import java.util.regex.Pattern; import java.util.zip.ZipEntry; import java.util.zip.ZipOutputStream; @@ -407,4 +416,110 @@ public static long fromHumanReadableToByte(String value, boolean assumeBinary) { } return (long) (Double.parseDouble(value) * Math.pow(unit, exp)); } + + public static void copyBytesParallel(InputStream is, OutputStream os) throws IOException { + copyBytesParallel(is, os, 4096); + } + public static void copyBytesParallel(InputStream is, OutputStream os, int bufferSize) throws IOException { + List buffersPool = Collections.synchronizedList(new LinkedList<>()); + ArrayBlockingQueue buffersQueue = new ArrayBlockingQueue<>(5); + AtomicReference exception = new AtomicReference<>(); + + Thread readerThread = new Thread(() -> { + try { + while (true) { + // Take a buffer from the pool or create a new one + ByteBuffer buf = buffersPool.isEmpty() ? 
ByteBuffer.allocate(bufferSize) : buffersPool.remove(0); + int bytesRead = is.read(buf.array()); + if (bytesRead == -1) { + buffersQueue.put(ByteBuffer.allocate(0)); // Signal end of stream + break; + } + buf.limit(bytesRead); + buffersQueue.put(buf); + } + } catch (Exception e) { + if (!exception.compareAndSet(null, e)) { + exception.get().addSuppressed(e); + } + } + }); + + Thread writerThread = new Thread(() -> { + try { + while (true) { + ByteBuffer buf = buffersQueue.take(); + if (buf.limit() == 0) { + break; // End of stream signal + } + os.write(buf.array(), 0, buf.limit()); + buf.clear(); + // Return the buffer to the pool + buffersPool.add(buf); + } + } catch (Exception e) { + if (!exception.compareAndSet(null, e)) { + exception.get().addSuppressed(e); + } + } + }); + + readerThread.start(); + writerThread.start(); + + try { + readerThread.join(); + writerThread.join(); + } catch (InterruptedException e) { + throw new IOException(e); + } + + if (exception.get() != null) { + throw new IOException(exception.get()); + } + } + + public static void copyBytesParallel2(InputStream is, OutputStream os, int bufferSize) throws IOException { + + List buffersPool = Collections.synchronizedList(new LinkedList<>()); + ParallelTaskRunner.Config config = ParallelTaskRunner.Config.builder() + .setNumTasks(1) + .setCapacity(5) + .setSorted(true) + .build(); + ParallelTaskRunner runner = new ParallelTaskRunner<>(batchSize -> { + try { + ByteBuffer buf = buffersPool.isEmpty() ? ByteBuffer.allocate(bufferSize) : buffersPool.remove(0); + int bytesRead = is.read(buf.array()); + if (bytesRead > 0) { + if (bytesRead != buf.array().length) { + buf.limit(bytesRead); + buf.rewind(); + } + return Collections.singletonList(buf); + } else { + return Collections.emptyList(); + } + } catch (IOException e) { + throw new UncheckedIOException(e); + } + }, t -> t, batch -> { + try { + for (ByteBuffer buf : batch) { + os.write(buf.array(), 0, buf.limit()); + // Return the buffer to the pool + buf.clear(); + buffersPool.add(buf); + } + } catch (IOException e1) { + throw new UncheckedIOException(e1); + } + return true; + }, config); + try { + runner.run(); + } catch (ExecutionException e) { + throw new IOException(e); + } + } } diff --git a/opencga-core/src/test/java/org/opencb/opencga/core/common/IOUtilsTest.java b/opencga-core/src/test/java/org/opencb/opencga/core/common/IOUtilsTest.java index f8d85da1acf..9da0d25a40a 100644 --- a/opencga-core/src/test/java/org/opencb/opencga/core/common/IOUtilsTest.java +++ b/opencga-core/src/test/java/org/opencb/opencga/core/common/IOUtilsTest.java @@ -16,14 +16,14 @@ package org.opencb.opencga.core.common; +import org.junit.Assert; import org.junit.Test; import org.junit.experimental.categories.Category; import org.opencb.opencga.core.testclassification.duration.ShortTests; -import java.io.BufferedReader; -import java.io.InputStream; -import java.io.InputStreamReader; +import java.io.*; import java.nio.file.Paths; +import java.util.Random; @Category(ShortTests.class) public class IOUtilsTest { @@ -68,4 +68,17 @@ public void testGrepFile() throws Exception { in.close(); } + + @Test + public void copyBytesHandlesBufferSizeSmallerThanInput() throws Exception { +// byte[] inputData = "Hello, World!".getBytes(); + byte[] inputData = new byte[10 * 1024 * 1024 + 5]; // 10 MB + new Random().nextBytes(inputData); + InputStream is = new ByteArrayInputStream(inputData); + ByteArrayOutputStream os = new ByteArrayOutputStream(); + + IOUtils.copyBytesParallel(is, os, 4096); + + 
Assert.assertArrayEquals(inputData, os.toByteArray()); + } } \ No newline at end of file diff --git a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/utils/MapReduceOutputFile.java b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/utils/MapReduceOutputFile.java index e91be76f97e..0510f38f464 100644 --- a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/utils/MapReduceOutputFile.java +++ b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/utils/MapReduceOutputFile.java @@ -5,10 +5,7 @@ import org.apache.commons.lang3.time.StopWatch; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileSystem; -import org.apache.hadoop.fs.LocatedFileStatus; -import org.apache.hadoop.fs.Path; -import org.apache.hadoop.fs.RemoteIterator; -import org.apache.hadoop.io.IOUtils; +import org.apache.hadoop.fs.*; import org.apache.hadoop.io.compress.*; import org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter; import org.apache.hadoop.util.ReflectionUtils; @@ -365,14 +362,17 @@ protected List concatMrOutputToLocal(Path mrOutdir, Path localOutput, bool try (OutputStream os = getOutputStreamPlain(localOutput.getName(), localOutput.getFileSystem(getConf()).create(localOutput))) { for (int i = 0; i < paths.size(); i++) { - Path path = paths.get(i); + Path partFile = paths.get(i); + long partFileSize = fileSystem.getFileStatus(partFile).getLen(); LOGGER.info("[{}] Concat {} file : '{}' ({}) ", i, - getCompression(path.getName()), - path.toUri(), - humanReadableByteCount(fileSystem.getFileStatus(path).getLen(), false)); - try (InputStream isAux = getInputStream(path.getName(), fileSystem.open(path))) { - InputStream is = isAux; + getCompression(partFile.getName()), + partFile.toUri(), + humanReadableByteCount(partFileSize, false)); + InputStream is = null; + Throwable e = null; + try { + is = getInputStream(partFile.getName(), fileSystem.open(partFile)); // Remove extra headers from all files but the first if (removeExtraHeaders && i != 0) { BufferedReader br = new BufferedReader(new InputStreamReader(is)); @@ -386,7 +386,28 @@ protected List concatMrOutputToLocal(Path mrOutdir, Path localOutput, bool is = new ReaderInputStream(br, Charset.defaultCharset()); } - IOUtils.copyBytes(is, os, getConf(), false); + if (partFileSize > 50 * 1024 * 1024) { + org.opencb.opencga.core.common.IOUtils.copyBytesParallel(is, os, getConf().getInt( + CommonConfigurationKeysPublic.IO_FILE_BUFFER_SIZE_KEY, + CommonConfigurationKeysPublic.IO_FILE_BUFFER_SIZE_DEFAULT)); + } else { + org.apache.hadoop.io.IOUtils.copyBytes(is, os, getConf(), false); + } + } catch (Throwable throwable) { + e = throwable; + throw throwable; + } finally { + if (is != null) { + try { + is.close(); + } catch (IOException ex) { + if (e == null) { + throw ex; + } else { + e.addSuppressed(ex); + } + } + } } } } From d430391dfcf7afea27d228dc2135fb25274bf1e6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jacobo=20Coll=20Morag=C3=B3n?= Date: Mon, 25 Nov 2024 17:28:28 +0000 Subject: [PATCH 38/66] storage: Increase mapreduce.task.timeout to 30min #TASK-6722 --- .../src/main/resources/storage-configuration.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/opencga-storage/opencga-storage-core/src/main/resources/storage-configuration.yml 
b/opencga-storage/opencga-storage-core/src/main/resources/storage-configuration.yml index dfa6865eb40..c80d7ffb92a 100644 --- a/opencga-storage/opencga-storage-core/src/main/resources/storage-configuration.yml +++ b/opencga-storage/opencga-storage-core/src/main/resources/storage-configuration.yml @@ -187,6 +187,7 @@ variant: # DOCKER_HOST environment variable to be used by the docker executor inside the MapReduce job storage.hadoop.mr.stream.docker.host: "" + mapreduce.task.timeout: 1800000 mapreduce.map.memory.mb: 2048 DeleteHBaseColumnDriver: storage.hadoop.write.mappers.limit.factor: 4 From e35ee834c3f6653a69bc3f7a263ee342022086e2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jacobo=20Coll=20Morag=C3=B3n?= Date: Mon, 25 Nov 2024 17:58:55 +0000 Subject: [PATCH 39/66] storage: Fix temporary mapreduce outdir. #TASK-6722 --- .../opencga/storage/hadoop/variant/executors/SshMRExecutor.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/executors/SshMRExecutor.java b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/executors/SshMRExecutor.java index e410e21265d..a4c974394b1 100644 --- a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/executors/SshMRExecutor.java +++ b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/executors/SshMRExecutor.java @@ -165,7 +165,7 @@ private MapReduceOutputFile initMrOutput(String executable, String[] args) throw String tempFilePrefix; if (i > 0) { String className = executable.substring(i); - tempFilePrefix = dbName + "_" + className; + tempFilePrefix = dbName + className; } else { tempFilePrefix = dbName; } From 0c486033762de4229fc13f7dc73ff6788eaa8912 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jacobo=20Coll=20Morag=C3=B3n?= Date: Tue, 26 Nov 2024 08:30:32 +0000 Subject: [PATCH 40/66] storage: Do not double copy hdfs files #TASK-6722 --- .../storage/hadoop/variant/executors/SshMRExecutor.java | 7 ------- 1 file changed, 7 deletions(-) diff --git a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/executors/SshMRExecutor.java b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/executors/SshMRExecutor.java index a4c974394b1..af90a7144ff 100644 --- a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/executors/SshMRExecutor.java +++ b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/executors/SshMRExecutor.java @@ -110,13 +110,6 @@ public Result run(String executable, String[] args) throws StorageEngineExceptio Runtime.getRuntime().removeShutdownHook(hook); ObjectMap result = readResult(new String(outputStream.toByteArray(), Charset.defaultCharset())); boolean succeed = exitValue == 0; - if (mrOutput != null) { - try { - mrOutput.postExecute(result, succeed); - } catch (IOException e) { - throw new StorageEngineException(e.getMessage(), e); - } - } try { if (succeed) { if (mrOutput != null) { From ccf7438be171c55c7130398eb8d53c6e3fcd60be Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jacobo=20Coll=20Morag=C3=B3n?= Date: Tue, 26 Nov 
2024 11:41:19 +0000 Subject: [PATCH 41/66] storage: Use reducer to concat binary files #TASK-6722 --- .../hadoop/utils/MapReduceOutputFile.java | 2 +- .../variant/executors/MRExecutorFactory.java | 5 ++ .../variant/executors/SshMRExecutor.java | 18 ++++--- .../variant/io/VariantExporterDriver.java | 4 ++ .../variant/HadoopVariantStorageTest.java | 35 ++++++++++-- .../HadoopVariantAnnotationManagerTest.java | 2 +- .../variant/index/sample/SampleIndexTest.java | 8 +-- .../variant/io/HadoopVariantExporterTest.java | 53 ++++++++++++++++--- 8 files changed, 103 insertions(+), 24 deletions(-) diff --git a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/utils/MapReduceOutputFile.java b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/utils/MapReduceOutputFile.java index 0510f38f464..e80a2d07da6 100644 --- a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/utils/MapReduceOutputFile.java +++ b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/utils/MapReduceOutputFile.java @@ -133,7 +133,7 @@ public static boolean isLocal(URI uri) { if (StringUtils.isEmpty(scheme)) { scheme = "file"; } - return StringUtils.isEmpty(scheme) || "file".equals(scheme); + return "file".equals(scheme); } public static boolean isHdfs(Path dir, Configuration conf) { diff --git a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/executors/MRExecutorFactory.java b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/executors/MRExecutorFactory.java index a51e4d8cde8..335bbeb14ac 100644 --- a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/executors/MRExecutorFactory.java +++ b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/executors/MRExecutorFactory.java @@ -3,6 +3,8 @@ import org.apache.hadoop.conf.Configuration; import org.opencb.commons.datastore.core.ObjectMap; import org.opencb.opencga.storage.core.exceptions.StorageEngineException; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; import static org.opencb.opencga.storage.hadoop.variant.HadoopVariantStorageOptions.MR_EXECUTOR; @@ -13,6 +15,8 @@ */ public final class MRExecutorFactory { + private static Logger logger = LoggerFactory.getLogger(SshMRExecutor.class); + private MRExecutorFactory() { } @@ -28,6 +32,7 @@ public static MRExecutor getMRExecutor(String dbName, ObjectMap options, Configu break; default: try { + logger.info("Creating new instance of MRExecutor '{}'", executor); Class aClass; aClass = Class.forName(executor).asSubclass(MRExecutor.class); mrExecutor = aClass.newInstance(); diff --git a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/executors/SshMRExecutor.java b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/executors/SshMRExecutor.java index af90a7144ff..57a391a24d7 100644 --- a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/executors/SshMRExecutor.java +++ 
b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/executors/SshMRExecutor.java @@ -50,7 +50,6 @@ public SshMRExecutor init(String dbName, Configuration conf, ObjectMap options) @Override public Result run(String executable, String[] args) throws StorageEngineException { MapReduceOutputFile mrOutput = initMrOutput(executable, args); - String commandLine = buildCommand(executable, args); List env = buildEnv(); ByteArrayOutputStream outputStream = new ByteArrayOutputStream(); @@ -103,13 +102,10 @@ public Result run(String executable, String[] args) throws StorageEngineExceptio } }); Runtime.getRuntime().addShutdownHook(hook); - Command command = new Command(commandLine, env); - command.setErrorOutputStream(outputStream); - command.run(); - int exitValue = command.getExitValue(); + int exitValue = runRemote(executable, args, env, outputStream); + boolean succeed = exitValue == 0; Runtime.getRuntime().removeShutdownHook(hook); ObjectMap result = readResult(new String(outputStream.toByteArray(), Charset.defaultCharset())); - boolean succeed = exitValue == 0; try { if (succeed) { if (mrOutput != null) { @@ -134,6 +130,14 @@ public Result run(String executable, String[] args) throws StorageEngineExceptio return new Result(exitValue, result); } + protected int runRemote(String executable, String[] args, List env, ByteArrayOutputStream outputStream) { + String commandLine = buildCommand(executable, args); + Command command = new Command(commandLine, env); + command.setErrorOutputStream(outputStream); + command.run(); + return command.getExitValue(); + } + /** * If the MapReduce to be executed is writing to a local filesystem, change the output to a temporary HDFS path. * The output will be copied to the local filesystem after the execution. @@ -198,7 +202,7 @@ private Path copyOutputFiles(String[] args, List env) throws StorageEngi private Path copyOutputFiles(String output, List env) throws StorageEngineException { URI targetOutputUri = UriUtils.createUriSafe(output); - if (MapReduceOutputFile.isLocal(targetOutputUri)) { + if (!MapReduceOutputFile.isLocal(targetOutputUri)) { logger.info("Output is not a file:// URI. 
Skipping copy file {}", targetOutputUri); return null; } diff --git a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/io/VariantExporterDriver.java b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/io/VariantExporterDriver.java index eea7f69d5b4..a26dd84d6b8 100644 --- a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/io/VariantExporterDriver.java +++ b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/io/VariantExporterDriver.java @@ -51,6 +51,10 @@ protected void parseAndValidateParameters() throws IOException { super.parseAndValidateParameters(); outputFormat = VariantWriterFactory.VariantOutputFormat.valueOf(getParam(OUTPUT_FORMAT_PARAM, "avro").toUpperCase()); + if (outputFormat.isBinary()) { + // Binary outputs should be concatenated in a reduce step + useReduceStep = true; + } } @Override diff --git a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/test/java/org/opencb/opencga/storage/hadoop/variant/HadoopVariantStorageTest.java b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/test/java/org/opencb/opencga/storage/hadoop/variant/HadoopVariantStorageTest.java index 4ddc5d03a0a..faa05ef9887 100644 --- a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/test/java/org/opencb/opencga/storage/hadoop/variant/HadoopVariantStorageTest.java +++ b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/test/java/org/opencb/opencga/storage/hadoop/variant/HadoopVariantStorageTest.java @@ -90,14 +90,17 @@ import org.opencb.opencga.storage.hadoop.utils.HBaseManager; import org.opencb.opencga.storage.hadoop.variant.adaptors.phoenix.VariantPhoenixSchemaManager; import org.opencb.opencga.storage.hadoop.variant.executors.MRExecutor; +import org.opencb.opencga.storage.hadoop.variant.executors.SshMRExecutor; import org.opencb.opencga.storage.hadoop.variant.index.IndexUtils; import org.opencb.opencga.storage.hadoop.variant.index.sample.SampleIndexSchema; import org.opencb.opencga.storage.hadoop.variant.utils.HBaseVariantTableNameGenerator; import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import java.io.ByteArrayOutputStream; import java.io.IOException; import java.io.InputStream; +import java.io.PrintStream; import java.lang.reflect.Method; import java.util.*; import java.util.concurrent.atomic.AtomicReference; @@ -394,13 +397,13 @@ static HadoopVariantStorageEngine getHadoopVariantStorageEngine(Map o } engine.setConfiguration(storageConfiguration, HadoopVariantStorageEngine.STORAGE_ENGINE_ID, VariantStorageBaseTest.DB_NAME); - engine.mrExecutor = new TestMRExecutor(configuration.get()); + engine.mrExecutor = null; engine.conf = conf; return engine; } - default TestMRExecutor getMrExecutor() { - return new TestMRExecutor(configuration.get()); + default MRExecutor getMrExecutor() throws StorageEngineException { + return HadoopVariantStorageTest.manager.get().getMRExecutor(); } static StorageConfiguration getStorageConfiguration(Configuration conf) throws IOException { @@ -416,7 +419,8 @@ static StorageConfiguration updateStorageConfiguration(StorageConfiguration stor StorageEngineConfiguration variantConfiguration = storageConfiguration.getVariantEngine(HadoopVariantStorageEngine.STORAGE_ENGINE_ID); ObjectMap options = 
variantConfiguration.getOptions(); - options.put(HadoopVariantStorageOptions.MR_EXECUTOR.key(), TestMRExecutor.class.getName()); + options.put(HadoopVariantStorageOptions.MR_JAR_WITH_DEPENDENCIES.key(), "dummy-test-jar-with-depepdencies.jar"); + options.put(HadoopVariantStorageOptions.MR_EXECUTOR.key(), TestSshMrExecutor.class.getName()); TestMRExecutor.setStaticConfiguration(conf); options.put(HadoopVariantStorageOptions.MR_ADD_DEPENDENCY_JARS.key(), false); @@ -517,6 +521,29 @@ default int getExpectedNumLoadedVariants(VariantFileMetadata fileMetadata) { return numRecords; } + class TestSshMrExecutor extends SshMRExecutor { + private final Configuration configuration; + + public TestSshMrExecutor() { + this.configuration = new Configuration(TestMRExecutor.staticConfiguration); + } + + @Override + protected int runRemote(String executable, String[] args, List env, ByteArrayOutputStream outputStream) { + PrintStream out = System.out; + try { + return new TestMRExecutor(conf).run(executable, args).getExitValue(); + } finally { + System.setOut(out); + } + } + + @Override + protected List buildEnv() { + return new LinkedList<>(); + } + } + class TestMRExecutor extends MRExecutor { private static Configuration staticConfiguration; diff --git a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/test/java/org/opencb/opencga/storage/hadoop/variant/annotation/HadoopVariantAnnotationManagerTest.java b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/test/java/org/opencb/opencga/storage/hadoop/variant/annotation/HadoopVariantAnnotationManagerTest.java index 253a24fc773..fa09dad8798 100644 --- a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/test/java/org/opencb/opencga/storage/hadoop/variant/annotation/HadoopVariantAnnotationManagerTest.java +++ b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/test/java/org/opencb/opencga/storage/hadoop/variant/annotation/HadoopVariantAnnotationManagerTest.java @@ -48,7 +48,7 @@ public void incrementalAnnotationTest() throws Exception { .append(VariantStorageOptions.STATS_CALCULATE.key(), false)); // Update pending variants - new TestMRExecutor().run(DiscoverPendingVariantsDriver.class, + getMrExecutor().run(DiscoverPendingVariantsDriver.class, DiscoverPendingVariantsDriver.buildArgs(engine.getDBAdaptor().getVariantTable(), AnnotationPendingVariantsDescriptor.class, new ObjectMap()), "Prepare variants to annotate"); diff --git a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/test/java/org/opencb/opencga/storage/hadoop/variant/index/sample/SampleIndexTest.java b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/test/java/org/opencb/opencga/storage/hadoop/variant/index/sample/SampleIndexTest.java index 98062c27b8f..2e64c65229c 100644 --- a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/test/java/org/opencb/opencga/storage/hadoop/variant/index/sample/SampleIndexTest.java +++ b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/test/java/org/opencb/opencga/storage/hadoop/variant/index/sample/SampleIndexTest.java @@ -324,13 +324,13 @@ public void regenerateSampleIndex() throws Exception { .append(SampleIndexDriver.SAMPLE_INDEX_VERSION, version) .append(SampleIndexDriver.OUTPUT, copy) .append(SampleIndexDriver.SAMPLES, "all"); - new TestMRExecutor().run(SampleIndexDriver.class, SampleIndexDriver.buildArgs( + getMrExecutor().run(SampleIndexDriver.class, SampleIndexDriver.buildArgs( 
dbAdaptor.getArchiveTableName(studyId), dbAdaptor.getVariantTable(), studyId, Collections.emptySet(), options), ""); - new TestMRExecutor().run(SampleIndexAnnotationLoaderDriver.class, SampleIndexAnnotationLoaderDriver.buildArgs( + getMrExecutor().run(SampleIndexAnnotationLoaderDriver.class, SampleIndexAnnotationLoaderDriver.buildArgs( dbAdaptor.getArchiveTableName(studyId), dbAdaptor.getVariantTable(), studyId, @@ -339,7 +339,7 @@ public void regenerateSampleIndex() throws Exception { if (sampleNames.get(study).containsAll(trios.get(0).toList())) { options.put(FamilyIndexDriver.TRIOS, trios.stream().map(Trio::serialize).collect(Collectors.joining(";"))); options.put(FamilyIndexDriver.OVERWRITE, true); - new TestMRExecutor().run(FamilyIndexDriver.class, FamilyIndexDriver.buildArgs( + getMrExecutor().run(FamilyIndexDriver.class, FamilyIndexDriver.buildArgs( dbAdaptor.getArchiveTableName(studyId), dbAdaptor.getVariantTable(), studyId, @@ -347,7 +347,7 @@ public void regenerateSampleIndex() throws Exception { } else if (study.equals(STUDY_NAME_3)) { options.put(FamilyIndexDriver.TRIOS, triosPlatinum.stream().map(Trio::serialize).collect(Collectors.joining(";"))); options.put(FamilyIndexDriver.OVERWRITE, true); - new TestMRExecutor().run(FamilyIndexDriver.class, FamilyIndexDriver.buildArgs( + getMrExecutor().run(FamilyIndexDriver.class, FamilyIndexDriver.buildArgs( dbAdaptor.getArchiveTableName(studyId), dbAdaptor.getVariantTable(), studyId, diff --git a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/test/java/org/opencb/opencga/storage/hadoop/variant/io/HadoopVariantExporterTest.java b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/test/java/org/opencb/opencga/storage/hadoop/variant/io/HadoopVariantExporterTest.java index 32f0151d676..d7873151820 100644 --- a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/test/java/org/opencb/opencga/storage/hadoop/variant/io/HadoopVariantExporterTest.java +++ b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/test/java/org/opencb/opencga/storage/hadoop/variant/io/HadoopVariantExporterTest.java @@ -1,11 +1,17 @@ package org.opencb.opencga.storage.hadoop.variant.io; +import com.fasterxml.jackson.databind.MapperFeature; +import com.fasterxml.jackson.databind.ObjectMapper; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.junit.*; import org.junit.experimental.categories.Category; import org.junit.runner.RunWith; import org.junit.runners.Parameterized; +import org.opencb.biodata.models.metadata.Individual; +import org.opencb.biodata.models.variant.Variant; +import org.opencb.biodata.models.variant.metadata.VariantMetadata; +import org.opencb.biodata.models.variant.metadata.VariantStudyMetadata; import org.opencb.commons.datastore.core.ObjectMap; import org.opencb.commons.datastore.core.Query; import org.opencb.commons.datastore.core.QueryOptions; @@ -17,6 +23,7 @@ import org.opencb.opencga.storage.core.variant.adaptors.VariantQuery; import org.opencb.opencga.storage.core.variant.io.VariantExporter; import org.opencb.opencga.storage.core.variant.io.VariantWriterFactory; +import org.opencb.opencga.storage.core.variant.io.avro.VariantAvroReader; import org.opencb.opencga.storage.core.variant.solr.VariantSolrExternalResource; import org.opencb.opencga.storage.hadoop.HBaseCompat; import org.opencb.opencga.storage.hadoop.variant.HadoopVariantStorageEngine; @@ -24,8 +31,14 @@ import 
org.opencb.opencga.storage.hadoop.variant.VariantHbaseTestUtils; import java.io.IOException; +import java.io.InputStream; import java.net.URI; import java.nio.file.Paths; +import java.util.HashMap; +import java.util.LinkedHashMap; +import java.util.List; +import java.util.Map; +import java.util.stream.Collectors; import static org.opencb.opencga.storage.core.variant.adaptors.VariantQueryParam.*; @@ -140,9 +153,28 @@ public void exportMultiRegion() throws Exception { public void exportAvroGz() throws Exception { String fileName = "variants.avro_gz"; URI uri = getOutputUri(fileName); - uri = variantStorageEngine.exportData(uri, VariantWriterFactory.VariantOutputFormat.AVRO_GZ, null, new Query(STUDY.key(), study1), new QueryOptions()).get(0); + List<URI> uris = variantStorageEngine.exportData(uri, VariantWriterFactory.VariantOutputFormat.AVRO_GZ, null, new Query(STUDY.key(), study1), new QueryOptions()); - copyToLocal(uri); + URI outputUri = copyToLocal(uris.get(0)); + if (exportToLocal) { + URI metaUri = copyToLocal(uris.get(1)); + + ObjectMapper objectMapper = new ObjectMapper().configure(MapperFeature.REQUIRE_SETTERS_FOR_GETTERS, true); + VariantMetadata metadata; + try (InputStream is = ioConnectorProvider.newInputStream(metaUri)) { + metadata = objectMapper.readValue(is, VariantMetadata.class); + } + + Map<String, LinkedHashMap<String, Integer>> samplesPositions = new HashMap<>(); + for (VariantStudyMetadata study : metadata.getStudies()) { + LinkedHashMap<String, Integer> samples = samplesPositions.computeIfAbsent(study.getId(), k -> new LinkedHashMap<>()); + for (Individual individual : study.getIndividuals()) { + samples.put(individual.getId(), samples.size()); + } + } + List<Variant> variants = new VariantAvroReader(Paths.get(outputUri).toFile(), samplesPositions).stream().collect(Collectors.toList()); + System.out.println("variants.size() = " + variants.size()); + } } @Test @@ -158,6 +190,7 @@ public void exportVcf() throws Exception { public void exportVcfGz() throws Exception { String fileName = "variants.vcf.gz"; URI uri = getOutputUri(fileName); + System.out.println("variantStorageEngine.getMRExecutor() = " + variantStorageEngine.getMRExecutor()); variantStorageEngine.exportData(uri, VariantWriterFactory.VariantOutputFormat.VCF_GZ, null, new Query(STUDY.key(), study1), new QueryOptions()); copyToLocal(fileName, uri); @@ -281,11 +314,11 @@ public void exportWithGenes() throws Exception { copyToLocal(fileName, uri); } - protected void copyToLocal(URI uri) throws IOException { - copyToLocal(Paths.get(uri.getPath()).getFileName().toString(), uri); + protected URI copyToLocal(URI uri) throws IOException { + return copyToLocal(Paths.get(uri.getPath()).getFileName().toString(), uri); } - protected void copyToLocal(String fileName, URI uri) throws IOException { + protected URI copyToLocal(String fileName, URI uri) throws IOException { if (!exportToLocal) { System.out.println("Copy file " + uri); FileSystem.get(externalResource.getConf()).copyToLocalFile(true, @@ -293,14 +326,20 @@ protected void copyToLocal(String fileName, URI uri) throws IOException { new Path(outputUri.resolve(fileName))); if (fileName.endsWith(VariantExporter.TPED_FILE_EXTENSION)) { + Path dst = new Path(outputUri.resolve(fileName.replace(VariantExporter.TPED_FILE_EXTENSION, VariantExporter.TFAM_FILE_EXTENSION))); FileSystem.get(externalResource.getConf()).copyToLocalFile(true, new Path(uri.toString().replace(VariantExporter.TPED_FILE_EXTENSION, VariantExporter.TFAM_FILE_EXTENSION)), - new Path(outputUri.resolve(fileName.replace(VariantExporter.TPED_FILE_EXTENSION,
VariantExporter.TFAM_FILE_EXTENSION)))); + dst); + return dst.toUri(); } else { + Path dst = new Path(outputUri.resolve(fileName + VariantExporter.METADATA_FILE_EXTENSION)); FileSystem.get(externalResource.getConf()).copyToLocalFile(true, new Path(uri.toString() + VariantExporter.METADATA_FILE_EXTENSION), - new Path(outputUri.resolve(fileName + VariantExporter.METADATA_FILE_EXTENSION))); + dst); + return dst.toUri(); } + } else { + return uri; } } From f87686e43cb6914d34d93f9e534509471ccf0090 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jacobo=20Coll=20Morag=C3=B3n?= Date: Wed, 27 Nov 2024 17:15:12 +0000 Subject: [PATCH 42/66] storage: Do not fail variant-walker if no output is produced. #TASK-6722 --- .../analysis/variant/VariantWalkerTool.java | 12 ++++++----- .../variant/HadoopVariantStorageEngine.java | 20 ++++++++++++++++++- 2 files changed, 26 insertions(+), 6 deletions(-) diff --git a/opencga-analysis/src/main/java/org/opencb/opencga/analysis/variant/VariantWalkerTool.java b/opencga-analysis/src/main/java/org/opencb/opencga/analysis/variant/VariantWalkerTool.java index 68ad63d3549..a37ca18486f 100644 --- a/opencga-analysis/src/main/java/org/opencb/opencga/analysis/variant/VariantWalkerTool.java +++ b/opencga-analysis/src/main/java/org/opencb/opencga/analysis/variant/VariantWalkerTool.java @@ -87,11 +87,13 @@ protected void run() throws Exception { }); step("move-files", () -> { // Move files to final directory - IOManager ioManager = catalogManager.getIoManagerFactory().get(uris.get(0)); - for (URI uri : uris) { - String fileName = UriUtils.fileName(uri); - logger.info("Moving file -- " + fileName); - ioManager.move(uri, getOutDir().resolve(fileName).toUri()); + if (!uris.isEmpty()) { + IOManager ioManager = catalogManager.getIoManagerFactory().get(uris.get(0)); + for (URI uri : uris) { + String fileName = UriUtils.fileName(uri); + logger.info("Moving file -- " + fileName); + ioManager.move(uri, getOutDir().resolve(fileName).toUri()); + } } }); } diff --git a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/HadoopVariantStorageEngine.java b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/HadoopVariantStorageEngine.java index 3affa2e3b95..c5d924e7698 100644 --- a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/HadoopVariantStorageEngine.java +++ b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/HadoopVariantStorageEngine.java @@ -44,6 +44,7 @@ import org.opencb.opencga.storage.core.exceptions.StorageEngineException; import org.opencb.opencga.storage.core.exceptions.StoragePipelineException; import org.opencb.opencga.storage.core.exceptions.VariantSearchException; +import org.opencb.opencga.storage.core.io.managers.IOConnector; import org.opencb.opencga.storage.core.io.managers.IOConnectorProvider; import org.opencb.opencga.storage.core.metadata.VariantMetadataFactory; import org.opencb.opencga.storage.core.metadata.VariantStorageMetadataManager; @@ -342,7 +343,24 @@ public List walkData(URI outputFile, VariantWriterFactory.VariantOutputForm .append(StreamVariantDriver.INPUT_FORMAT_PARAM, format.toString()) .append(StreamVariantDriver.OUTPUT_PARAM, outputFile) ), "Walk data"); - return Arrays.asList(outputFile, UriUtils.createUriSafe(outputFile.toString() + StreamVariantDriver.STDERR_TXT_GZ)); +
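// The walker may legitimately produce no output at all, so return only the files that were actually written instead of assuming both exist. +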
List uris = new ArrayList<>(); + URI stderrFile = UriUtils.createUriSafe(outputFile.toString() + StreamVariantDriver.STDERR_TXT_GZ); + try { + IOConnector ioConnector = ioConnectorProvider.get(outputFile); + if (ioConnector.exists(outputFile)) { + uris.add(outputFile); + } else { + logger.warn("Output file not found: {}", outputFile); + } + if (ioConnector.exists(stderrFile)) { + uris.add(stderrFile); + } else { + logger.warn("Stderr file not found: {}", stderrFile); + } + } catch (IOException e) { + throw new StorageEngineException("Error checking output file", e); + } + return uris; } @Override From a389e10115d75e9a023cebf63ac28e5e1313bf16 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jacobo=20Coll=20Morag=C3=B3n?= Date: Wed, 27 Nov 2024 17:16:52 +0000 Subject: [PATCH 43/66] storage: Split PhoenixInputSplits into smaller splits. #TASK-6722 --- .../variant/HadoopVariantStorageOptions.java | 1 + .../variant/mr/CustomPhoenixInputFormat.java | 52 +++++++++++++++++-- 2 files changed, 50 insertions(+), 3 deletions(-) diff --git a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/HadoopVariantStorageOptions.java b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/HadoopVariantStorageOptions.java index 268caaf9253..37331233aad 100644 --- a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/HadoopVariantStorageOptions.java +++ b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/HadoopVariantStorageOptions.java @@ -42,6 +42,7 @@ public enum HadoopVariantStorageOptions implements ConfigurationOption { MR_HBASE_SCAN_CACHING("storage.hadoop.mr.scan.caching", 50), MR_HBASE_SCAN_MAX_COLUMNS("storage.hadoop.mr.scan.maxColumns", 25000), MR_HBASE_SCAN_MAX_FILTERS("storage.hadoop.mr.scan.maxFilters", 2000), + MR_HBASE_PHOENIX_SCAN_SPLIT("storage.hadoop.mr.phoenix.scanSplit", 5), /** * MapReduce executor. Could be either 'system' or 'ssh'. 
diff --git a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/mr/CustomPhoenixInputFormat.java b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/mr/CustomPhoenixInputFormat.java index 5b280facb0e..b8e34933c95 100644 --- a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/mr/CustomPhoenixInputFormat.java +++ b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/mr/CustomPhoenixInputFormat.java @@ -7,6 +7,7 @@ import org.apache.commons.logging.LogFactory; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.hbase.client.Scan; +import org.apache.hadoop.hbase.util.Bytes; import org.apache.hadoop.io.NullWritable; import org.apache.hadoop.mapreduce.*; import org.apache.hadoop.mapreduce.lib.db.DBWritable; @@ -21,11 +22,16 @@ import org.apache.phoenix.query.QueryServices; import org.apache.phoenix.util.PhoenixRuntime; import org.opencb.opencga.storage.hadoop.HBaseCompat; +import org.opencb.opencga.storage.hadoop.variant.HadoopVariantStorageOptions; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; import java.io.Closeable; import java.io.IOException; import java.sql.Connection; import java.sql.Statement; +import java.util.ArrayList; +import java.util.Collections; import java.util.List; import java.util.Properties; @@ -39,6 +45,7 @@ */ public class CustomPhoenixInputFormat extends InputFormat { private static final Log LOG = LogFactory.getLog(CustomPhoenixInputFormat.class); + private static Logger logger = LoggerFactory.getLogger(CustomPhoenixInputFormat.class); @Override public RecordReader createRecordReader(InputSplit split, TaskAttemptContext context) @@ -58,6 +65,20 @@ public CloseValueRecordReader(RecordReader recordReader) { super(recordReader, v -> v); } + @Override + public void initialize(InputSplit split, TaskAttemptContext context) throws IOException, InterruptedException { + super.initialize(split, context); + if (split instanceof PhoenixInputSplit) { + PhoenixInputSplit phoenixInputSplit = (PhoenixInputSplit) split; + logger.info("Key range : " + phoenixInputSplit.getKeyRange()); + logger.info("Split: " + phoenixInputSplit.getScans().size() + " scans"); + int i = 0; + for (Scan scan : phoenixInputSplit.getScans()) { + logger.info("[{}] Scan: {}", ++i, scan); + } + } + } + @Override public void close() throws IOException { V currentValue; @@ -78,16 +99,41 @@ public List getSplits(JobContext context) throws IOException, Interr final Configuration configuration = context.getConfiguration(); final QueryPlan queryPlan = getQueryPlan(context, configuration); final List allSplits = queryPlan.getSplits(); - final List splits = generateSplits(queryPlan, allSplits); + final List splits = generateSplits(queryPlan, allSplits, configuration); return splits; } - private List generateSplits(final QueryPlan qplan, final List splits) throws IOException { + private List generateSplits(final QueryPlan qplan, final List splits, Configuration configuration) + throws IOException { Preconditions.checkNotNull(qplan); Preconditions.checkNotNull(splits); final List psplits = Lists.newArrayListWithExpectedSize(splits.size()); for (List scans : qplan.getScans()) { - psplits.add(new PhoenixInputSplit(scans)); + if (scans.size() == 1) { + // Split scans into multiple smaller scans + int numScans = 
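/* number of sub-scans to generate per Phoenix split; default 5 */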
configuration.getInt(HadoopVariantStorageOptions.MR_HBASE_PHOENIX_SCAN_SPLIT.key(), + HadoopVariantStorageOptions.MR_HBASE_PHOENIX_SCAN_SPLIT.defaultValue()); + List splitScans = new ArrayList<>(numScans); + Scan scan = scans.get(0); + byte[] startRow = scan.getStartRow(); + byte[] stopRow = scan.getStopRow(); + if (startRow != null && startRow.length != 0 && stopRow != null && stopRow.length != 0) { + byte[][] ranges = Bytes.split(startRow, stopRow, numScans - 1); + for (int i = 1; i < ranges.length; i++) { + Scan splitScan = new Scan(scan); + splitScan.withStartRow(ranges[i - 1]); + splitScan.withStopRow(ranges[i], false); + splitScans.add(splitScan); + } + } else { + splitScans.add(scan); + } + for (Scan splitScan : splitScans) { + psplits.add(new PhoenixInputSplit(Collections.singletonList(splitScan))); + } + } else { + psplits.add(new PhoenixInputSplit(scans)); + } } return psplits; } From f4530908d26dca9e4acd14baa7d50a3ef9766205 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jacobo=20Coll=20Morag=C3=B3n?= Date: Wed, 27 Nov 2024 17:32:49 +0000 Subject: [PATCH 44/66] storage: Improve log message. #TASK-6722 --- .../storage/hadoop/variant/executors/SshMRExecutor.java | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/executors/SshMRExecutor.java b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/executors/SshMRExecutor.java index 57a391a24d7..cd61a522a94 100644 --- a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/executors/SshMRExecutor.java +++ b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/executors/SshMRExecutor.java @@ -157,6 +157,8 @@ private MapReduceOutputFile initMrOutput(String executable, String[] args) throw String output = argsList.get(outputIdx + 1); URI outputUri = UriUtils.createUriSafe(output); if (MapReduceOutputFile.isLocal(outputUri)) { + logger.info("This MapReduce will produce some output. Change output location from file:// to a temporary hdfs:// file" + + " so it can be copied to the local filesystem after the execution"); try { int i = executable.lastIndexOf('.'); String tempFilePrefix; @@ -171,7 +173,6 @@ private MapReduceOutputFile initMrOutput(String executable, String[] args) throw } catch (IOException e) { throw new StorageEngineException(e.getMessage(), e); } - logger.info("Change output from file:// to hdfs://. Using MapReduceOutputFile: " + mrOutput.getOutdir()); // Replace output path with the temporary path argsList.set(outputIdx + 1, mrOutput.getOutdir().toString()); } From 47535c1a2ee74e3f810434507c894adca0c5b0cc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jacobo=20Coll=20Morag=C3=B3n?= Date: Thu, 28 Nov 2024 10:38:27 +0000 Subject: [PATCH 45/66] storage: Add HadoopVariantWalkerTest. 
#TASK-6722 --- .../analysis/variant/VariantWalkerTool.java | 3 + .../variant/HadoopVariantStorageEngine.java | 7 +- .../variant/executors/MRExecutorFactory.java | 2 +- .../variant/mr/StreamVariantMapper.java | 2 +- .../src/main/python/requirements.txt | 0 .../VariantHadoopStoragePipelineTest.java | 49 ------- .../walker/HadoopVariantWalkerTest.java | 128 ++++++++++++++++++ .../test/resources/variantWalker/Dockerfile | 17 +++ 8 files changed, 156 insertions(+), 52 deletions(-) create mode 100644 opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/python/requirements.txt create mode 100644 opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/test/java/org/opencb/opencga/storage/hadoop/variant/walker/HadoopVariantWalkerTest.java create mode 100644 opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/test/resources/variantWalker/Dockerfile diff --git a/opencga-analysis/src/main/java/org/opencb/opencga/analysis/variant/VariantWalkerTool.java b/opencga-analysis/src/main/java/org/opencb/opencga/analysis/variant/VariantWalkerTool.java index a37ca18486f..0dd6f10c344 100644 --- a/opencga-analysis/src/main/java/org/opencb/opencga/analysis/variant/VariantWalkerTool.java +++ b/opencga-analysis/src/main/java/org/opencb/opencga/analysis/variant/VariantWalkerTool.java @@ -54,6 +54,9 @@ protected void check() throws Exception { } format = VariantWriterFactory.toOutputFormat(toolParams.getFileFormat(), toolParams.getOutputFileName()); + if (format.isBinary()) { + throw new IllegalArgumentException("Binary format not supported for VariantWalkerTool"); + } if (!format.isPlain()) { format = format.inPlain(); } diff --git a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/HadoopVariantStorageEngine.java b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/HadoopVariantStorageEngine.java index c5d924e7698..137ee955663 100644 --- a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/HadoopVariantStorageEngine.java +++ b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/HadoopVariantStorageEngine.java @@ -321,7 +321,12 @@ protected VariantExporter newVariantExporter(VariantMetadataFactory metadataFact public List walkData(URI outputFile, VariantWriterFactory.VariantOutputFormat format, Query query, QueryOptions queryOptions, String commandLine) throws StorageEngineException { ParsedVariantQuery variantQuery = parseQuery(query, queryOptions); - int studyId = variantQuery.getStudyQuery().getDefaultStudy().getId(); + int studyId; + if (variantQuery.getStudyQuery().getDefaultStudy() == null) { + studyId = -1; + } else { + studyId = variantQuery.getStudyQuery().getDefaultStudy().getId(); + } ObjectMap params = new ObjectMap(getOptions()).appendAll(variantQuery.getQuery()).appendAll(variantQuery.getInputOptions()); params.remove(StreamVariantDriver.COMMAND_LINE_PARAM); diff --git a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/executors/MRExecutorFactory.java b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/executors/MRExecutorFactory.java index 335bbeb14ac..8560847e139 100644 --- 
a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/executors/MRExecutorFactory.java +++ b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/executors/MRExecutorFactory.java @@ -15,7 +15,7 @@ */ public final class MRExecutorFactory { - private static Logger logger = LoggerFactory.getLogger(SshMRExecutor.class); + private static Logger logger = LoggerFactory.getLogger(MRExecutorFactory.class); private MRExecutorFactory() { } diff --git a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/mr/StreamVariantMapper.java b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/mr/StreamVariantMapper.java index 5132d49a7b0..95d0e0fb8cd 100644 --- a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/mr/StreamVariantMapper.java +++ b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/mr/StreamVariantMapper.java @@ -278,7 +278,7 @@ private void dockerPruneImages() { LOG.info("Pruning docker images"); int maxImages = 5; Command command = new Command(new String[]{"bash", "-c", "[ $(docker image ls --format json | wc -l) -gt " + maxImages + " ] " - + "&& echo 'Run docker image prune' && docker image prune -f -a " + + "&& echo 'Run docker image prune' && docker image prune -f --all --filter label!=storage='do_not_delete'" + "|| echo 'Skipping docker image prune. Less than " + maxImages + " images.'"}, Collections.emptyMap()); command.run(); int ecode = command.getExitValue(); diff --git a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/python/requirements.txt b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/python/requirements.txt new file mode 100644 index 00000000000..e69de29bb2d diff --git a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/test/java/org/opencb/opencga/storage/hadoop/variant/VariantHadoopStoragePipelineTest.java b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/test/java/org/opencb/opencga/storage/hadoop/variant/VariantHadoopStoragePipelineTest.java index ac01326b1cc..773f017a67c 100644 --- a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/test/java/org/opencb/opencga/storage/hadoop/variant/VariantHadoopStoragePipelineTest.java +++ b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/test/java/org/opencb/opencga/storage/hadoop/variant/VariantHadoopStoragePipelineTest.java @@ -40,7 +40,6 @@ import org.opencb.opencga.storage.core.variant.VariantStorageOptions; import org.opencb.opencga.storage.core.variant.adaptors.VariantQueryParam; import org.opencb.opencga.storage.core.variant.adaptors.iterators.VariantDBIterator; -import org.opencb.opencga.storage.core.variant.io.VariantWriterFactory; import org.opencb.opencga.storage.hadoop.utils.HBaseManager; import org.opencb.opencga.storage.hadoop.variant.adaptors.VariantHadoopDBAdaptor; import org.opencb.opencga.storage.hadoop.variant.adaptors.phoenix.VariantPhoenixKeyFactory; @@ -285,52 +284,4 @@ public void printVariants() throws Exception { VariantHbaseTestUtils.printVariants(studyMetadata, dbAdaptor, outDir); } - - @Test - public void exportCommand() throws Exception { - URI 
outdir = newOutputUri(); - List cmdList = Arrays.asList( - "export NUM_VARIANTS=0 ;", - "function setup() {", - " echo \"#SETUP\" ;", - " echo '## Something in single quotes' ; ", - "} ;", - "function map() {", -// " echo \"[$NUM_VARIANTS] $1\" 1>&2 ;", - " echo \"[$NUM_VARIANTS] \" 1>&2 ;", - " echo \"$1\" | jq .id ;", - " NUM_VARIANTS=$((NUM_VARIANTS+1)) ;", - "};", - "function cleanup() {", - " echo \"CLEANUP\" ;", - " echo \"NumVariants = $NUM_VARIANTS\" ;", - "};", - "setup;", - "while read -r i ; do ", - " map \"$i\" ; ", - "done; ", - "cleanup;"); - - // TODO: Add docker prune - - // String cmd = "bash -c '" + String.join("\n", cmdList) + "'"; - String cmd = String.join("\n", cmdList); - String cmdBash = "bash -ce '" + cmd.replace("'", "'\"'\"'") + "'"; - String cmdDocker = "docker run --rm -i opencb/opencga-base bash -ce '" + cmd.replace("'", "'\"'\"'") + "'"; - String cmdPython1 = "python variant_walker.py walker_example Cut --length 30"; -// String cmdPython2 = "python /home/jacobo/appl/opencga/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/python/* opencga-storage-hadoop-walker-example MyWalker --length 30"; - - -// variantStorageEngine.walkData(outdir.resolve("variant3.txt.gz"), VariantWriterFactory.VariantOutputFormat.JSON, new Query(), new QueryOptions(), cmdDocker); -// variantStorageEngine.walkData(outdir.resolve("variant2.txt.gz"), VariantWriterFactory.VariantOutputFormat.JSON, new Query(), new QueryOptions(), cmdBash); -// variantStorageEngine.walkData(outdir.resolve("variant1.txt.gz"), VariantWriterFactory.VariantOutputFormat.JSON, new Query(), new QueryOptions(), cmd); -// variantStorageEngine.walkData(outdir.resolve("variant5.txt.gz"), VariantWriterFactory.VariantOutputFormat.JSON, new Query(), new QueryOptions(), cmdPython1); -// variantStorageEngine.walkData(outdir.resolve("variant8.txt.gz"), VariantWriterFactory.VariantOutputFormat.JSON, new Query(), new QueryOptions(), cmdPython2); -// variantStorageEngine.walkData(outdir.resolve("variant6.txt.gz"), VariantWriterFactory.VariantOutputFormat.VCF, new Query(), new QueryOptions(), cmdPython); -// variantStorageEngine.walkData(outdir.resolve("variant4.txt.gz"), VariantWriterFactory.VariantOutputFormat.JSON, new Query(), new QueryOptions(), "opencb/opencga-base", cmd); -// variantStorageEngine.walkData(outdir.resolve("variant4.txt.gz"), VariantWriterFactory.VariantOutputFormat.JSON, new Query(), new QueryOptions(), "opencb/opencga-base", cmdPython1); - variantStorageEngine.walkData(outdir.resolve("variant4.txt.gz"), VariantWriterFactory.VariantOutputFormat.JSON, new Query(), new QueryOptions(), "jcoll/my-python-app:latest", cmdPython1); - - } - } diff --git a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/test/java/org/opencb/opencga/storage/hadoop/variant/walker/HadoopVariantWalkerTest.java b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/test/java/org/opencb/opencga/storage/hadoop/variant/walker/HadoopVariantWalkerTest.java new file mode 100644 index 00000000000..02606a01fff --- /dev/null +++ b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/test/java/org/opencb/opencga/storage/hadoop/variant/walker/HadoopVariantWalkerTest.java @@ -0,0 +1,128 @@ +package org.opencb.opencga.storage.hadoop.variant.walker; + + +import org.junit.Before; +import org.junit.BeforeClass; +import org.junit.ClassRule; +import org.junit.Test; +import org.junit.experimental.categories.Category; +import org.opencb.commons.datastore.core.ObjectMap; 
+import org.opencb.commons.datastore.core.Query; +import org.opencb.commons.datastore.core.QueryOptions; +import org.opencb.commons.exec.Command; +import org.opencb.opencga.core.testclassification.duration.LongTests; +import org.opencb.opencga.storage.core.metadata.models.StudyMetadata; +import org.opencb.opencga.storage.core.variant.VariantStorageBaseTest; +import org.opencb.opencga.storage.core.variant.VariantStorageOptions; +import org.opencb.opencga.storage.core.variant.io.VariantWriterFactory; +import org.opencb.opencga.storage.hadoop.variant.HadoopVariantStorageEngine; +import org.opencb.opencga.storage.hadoop.variant.HadoopVariantStorageTest; +import org.opencb.opencga.storage.hadoop.variant.VariantHbaseTestUtils; + +import java.io.IOException; +import java.net.URI; +import java.nio.file.Path; +import java.nio.file.Paths; +import java.util.Arrays; +import java.util.Collections; +import java.util.List; + +import static org.junit.Assert.assertEquals; + +@Category(LongTests.class) +public class HadoopVariantWalkerTest extends VariantStorageBaseTest implements HadoopVariantStorageTest { + + @ClassRule + public static HadoopExternalResource externalResource = new HadoopExternalResource(); + private static String _dockerImage; + + @Before + public void before() throws Exception { + // Do not clear DB for each test + } + + @BeforeClass + public static void beforeClass() throws Exception { + HadoopVariantStorageEngine variantStorageManager = externalResource.getVariantStorageEngine(); + externalResource.clearDB(variantStorageManager.getDBName()); + +// URI inputUri = VariantStorageBaseTest.getResourceUri("sample1.genome.vcf"); +// URI inputUri = VariantStorageBaseTest.getResourceUri("platinum/1K.end.platinum-genomes-vcf-NA12877_S1.genome.vcf.gz"); + URI inputUri = VariantStorageBaseTest.getResourceUri("variant-test-file.vcf.gz"); + + StudyMetadata studyMetadata = VariantStorageBaseTest.newStudyMetadata(); + VariantStorageBaseTest.runDefaultETL(inputUri, variantStorageManager, studyMetadata, + new ObjectMap(VariantStorageOptions.TRANSFORM_FORMAT.key(), "avro") + .append(VariantStorageOptions.ANNOTATE.key(), true) + .append(VariantStorageOptions.STATS_CALCULATE.key(), false) + ); + + VariantHbaseTestUtils.printVariants(variantStorageManager.getDBAdaptor(), newOutputUri()); + } + + @Test + public void exportCommand() throws Exception { + URI outdir = newOutputUri(); + + List cmdList = Arrays.asList( + "export NUM_VARIANTS=0 ;", + "function setup() {", + " echo \"#SETUP\" ;", + " echo '## Something in single quotes' ; ", + "} ;", + "function map() {", +// " echo \"[$NUM_VARIANTS] $1\" 1>&2 ;", + " echo \"[$NUM_VARIANTS] \" 1>&2 ;", + " echo \"$1\" | jq .id ;", + " NUM_VARIANTS=$((NUM_VARIANTS+1)) ;", + "};", + "function cleanup() {", + " echo \"CLEANUP\" ;", + " echo \"NumVariants = $NUM_VARIANTS\" ;", + "};", + "setup;", + "while read -r i ; do ", + " map \"$i\" ; ", + "done; ", + "cleanup;"); + + // String cmd = "bash -c '" + String.join("\n", cmdList) + "'"; + String cmd = String.join("\n", cmdList); + +// variantStorageEngine.walkData(outdir.resolve("variant3.txt.gz"), VariantWriterFactory.VariantOutputFormat.JSON, new Query(), new QueryOptions(), cmdDocker); +// variantStorageEngine.walkData(outdir.resolve("variant2.txt.gz"), VariantWriterFactory.VariantOutputFormat.JSON, new Query(), new QueryOptions(), cmdBash); + variantStorageEngine.walkData(outdir.resolve("variant1.txt.gz"), VariantWriterFactory.VariantOutputFormat.JSON, new Query(), new QueryOptions(), cmd); +// 
variantStorageEngine.walkData(outdir.resolve("variant5.txt.gz"), VariantWriterFactory.VariantOutputFormat.JSON, new Query(), new QueryOptions(), cmdPython1); +// variantStorageEngine.walkData(outdir.resolve("variant8.txt.gz"), VariantWriterFactory.VariantOutputFormat.JSON, new Query(), new QueryOptions(), cmdPython2); +// variantStorageEngine.walkData(outdir.resolve("variant6.txt.gz"), VariantWriterFactory.VariantOutputFormat.VCF, new Query(), new QueryOptions(), cmdPython); +// variantStorageEngine.walkData(outdir.resolve("variant4.txt.gz"), VariantWriterFactory.VariantOutputFormat.JSON, new Query(), new QueryOptions(), "opencb/opencga-base", cmd); +// variantStorageEngine.walkData(outdir.resolve("variant4.txt.gz"), VariantWriterFactory.VariantOutputFormat.JSON, new Query(), new QueryOptions(), "opencb/opencga-base", cmdPython1); + } + + @Test + public void exportDocker() throws Exception { + URI outdir = newOutputUri(); + String dockerImage = buildDocker(); + + String cmdPython1 = "python variant_walker.py walker_example Cut --length 30"; + + variantStorageEngine.walkData(outdir.resolve("variant4.txt.gz"), VariantWriterFactory.VariantOutputFormat.JSON, new Query(), new QueryOptions(), dockerImage, cmdPython1); + } + + private static String buildDocker() throws IOException { + if (HadoopVariantWalkerTest._dockerImage != null) { + return HadoopVariantWalkerTest._dockerImage; + } + String dockerImage = "local/variant-walker-test:latest"; + Path dockerFile = Paths.get(getResourceUri("variantWalker/Dockerfile").getPath()); +// Path pythonDir = Paths.get("../../opencga-storage-core/src/main/python").toAbsolutePath(); + Path pythonDir = Paths.get("src/main/python").toAbsolutePath(); + Command dockerBuild = new Command(new String[]{"docker", "build", "-t", dockerImage, "-f", dockerFile.toString(), pythonDir.toString()}, Collections.emptyMap()); + dockerBuild.run(); + assertEquals(0, dockerBuild.getExitValue()); + HadoopVariantWalkerTest._dockerImage = dockerImage; + return dockerImage; + } + + +} diff --git a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/test/resources/variantWalker/Dockerfile b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/test/resources/variantWalker/Dockerfile new file mode 100644 index 00000000000..bd9f5511adf --- /dev/null +++ b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/test/resources/variantWalker/Dockerfile @@ -0,0 +1,17 @@ +# Use an official Python runtime as a parent image +FROM python:3.8-slim-buster + +# Set the working directory in the container to /app +WORKDIR /app + +ARG PYTHON_PATH="." + +LABEL storage="do_not_delete" + +RUN echo ${PYTHON_PATH} +# Copy the python directory contents into the container at /app +COPY ${PYTHON_PATH} /app + +# Install any needed packages specified in requirements.txt +RUN ls -la /app +RUN pip install --no-cache-dir -r requirements.txt From 003e467f57a8dbc8660f0735f214fd2766b198b9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jacobo=20Coll=20Morag=C3=B3n?= Date: Thu, 28 Nov 2024 14:33:53 +0000 Subject: [PATCH 46/66] storage: Rename some variant-walker params. 
Add descriptions #TASK-6722 --- .../analysis/variant/VariantWalkerTool.java | 6 ++-- .../AnalysisVariantCommandExecutor.java | 3 +- .../AnalysisVariantCommandOptions.java | 17 +++++------ .../models/variant/VariantWalkerParams.java | 30 +++++++++---------- .../walker/HadoopVariantWalkerTest.java | 28 ++++++++++------- 5 files changed, 43 insertions(+), 41 deletions(-) diff --git a/opencga-analysis/src/main/java/org/opencb/opencga/analysis/variant/VariantWalkerTool.java b/opencga-analysis/src/main/java/org/opencb/opencga/analysis/variant/VariantWalkerTool.java index 0dd6f10c344..5ab80d4f57e 100644 --- a/opencga-analysis/src/main/java/org/opencb/opencga/analysis/variant/VariantWalkerTool.java +++ b/opencga-analysis/src/main/java/org/opencb/opencga/analysis/variant/VariantWalkerTool.java @@ -49,11 +49,11 @@ public class VariantWalkerTool extends OpenCgaTool { protected void check() throws Exception { super.check(); - if (StringUtils.isEmpty(toolParams.getFileFormat())) { - toolParams.setFileFormat(VariantWriterFactory.VariantOutputFormat.VCF.toString()); + if (StringUtils.isEmpty(toolParams.getInputFormat())) { + toolParams.setInputFormat(VariantWriterFactory.VariantOutputFormat.VCF.toString()); } - format = VariantWriterFactory.toOutputFormat(toolParams.getFileFormat(), toolParams.getOutputFileName()); + format = VariantWriterFactory.toOutputFormat(toolParams.getInputFormat(), toolParams.getOutputFileName()); if (format.isBinary()) { throw new IllegalArgumentException("Binary format not supported for VariantWalkerTool"); } diff --git a/opencga-app/src/main/java/org/opencb/opencga/app/cli/main/executors/AnalysisVariantCommandExecutor.java b/opencga-app/src/main/java/org/opencb/opencga/app/cli/main/executors/AnalysisVariantCommandExecutor.java index 3e3d5cd7528..1410934a499 100644 --- a/opencga-app/src/main/java/org/opencb/opencga/app/cli/main/executors/AnalysisVariantCommandExecutor.java +++ b/opencga-app/src/main/java/org/opencb/opencga/app/cli/main/executors/AnalysisVariantCommandExecutor.java @@ -1990,9 +1990,8 @@ private RestResponse runWalker() throws Exception { putNestedIfNotEmpty(beanParams, "unknownGenotype", commandOptions.unknownGenotype, true); putNestedIfNotNull(beanParams, "sampleMetadata", commandOptions.sampleMetadata, true); putNestedIfNotNull(beanParams, "sort", commandOptions.sort, true); - putNestedIfNotEmpty(beanParams, "outdir", commandOptions.outdir, true); putNestedIfNotEmpty(beanParams, "outputFileName", commandOptions.outputFileName, true); - putNestedIfNotEmpty(beanParams, "fileFormat", commandOptions.fileFormat, true); + putNestedIfNotEmpty(beanParams, "inputFormat", commandOptions.inputFormat, true); putNestedIfNotEmpty(beanParams, "dockerImage", commandOptions.dockerImage, true); putNestedIfNotEmpty(beanParams, "commandLine", commandOptions.commandLine, true); putNestedIfNotEmpty(beanParams, "include", commandOptions.bodyInclude, true); diff --git a/opencga-app/src/main/java/org/opencb/opencga/app/cli/main/options/AnalysisVariantCommandOptions.java b/opencga-app/src/main/java/org/opencb/opencga/app/cli/main/options/AnalysisVariantCommandOptions.java index 998a7dc510b..24a37e04229 100644 --- a/opencga-app/src/main/java/org/opencb/opencga/app/cli/main/options/AnalysisVariantCommandOptions.java +++ b/opencga-app/src/main/java/org/opencb/opencga/app/cli/main/options/AnalysisVariantCommandOptions.java @@ -3113,25 +3113,22 @@ public class RunWalkerCommandOptions { @Parameter(names = {"--sort"}, description = "The body web service sort parameter", required = false, help 
= true, arity = 0) public boolean sort = false; - @Parameter(names = {"--outdir"}, description = "The body web service outdir parameter", required = false, arity = 1) - public String outdir; - - @Parameter(names = {"--output-file-name"}, description = "The body web service outputFileName parameter", required = false, arity = 1) + @Parameter(names = {"--output-file-name"}, description = "Output file name", required = false, arity = 1) public String outputFileName; - @Parameter(names = {"--file-format"}, description = "The body web service fileFormat parameter", required = false, arity = 1) - public String fileFormat; + @Parameter(names = {"--input-format"}, description = "Format that will be used as input for the variant walker", required = false, arity = 1) + public String inputFormat; - @Parameter(names = {"--docker-image"}, description = "The body web service dockerImage parameter", required = false, arity = 1) + @Parameter(names = {"--docker-image"}, description = "Docker image to use", required = false, arity = 1) public String dockerImage; - @Parameter(names = {"--command-line"}, description = "The body web service commandLine parameter", required = false, arity = 1) + @Parameter(names = {"--command-line"}, description = "Command line to execute from the walker", required = false, arity = 1) public String commandLine; - @Parameter(names = {"--body_include"}, description = "The body web service include parameter", required = false, arity = 1) + @Parameter(names = {"--body_include"}, description = "Fields included in the response, whole JSON path must be provided", required = false, arity = 1) public String bodyInclude; - @Parameter(names = {"--body_exclude"}, description = "The body web service exclude parameter", required = false, arity = 1) + @Parameter(names = {"--body_exclude"}, description = "Fields excluded in the response, whole JSON path must be provided", required = false, arity = 1) public String bodyExclude; } diff --git a/opencga-core/src/main/java/org/opencb/opencga/core/models/variant/VariantWalkerParams.java b/opencga-core/src/main/java/org/opencb/opencga/core/models/variant/VariantWalkerParams.java index ef541690fc9..22a3d51f57e 100644 --- a/opencga-core/src/main/java/org/opencb/opencga/core/models/variant/VariantWalkerParams.java +++ b/opencga-core/src/main/java/org/opencb/opencga/core/models/variant/VariantWalkerParams.java @@ -1,24 +1,24 @@ package org.opencb.opencga.core.models.variant; +import org.opencb.commons.annotations.DataField; +import org.opencb.opencga.core.api.ParamConstants; + public class VariantWalkerParams extends VariantQueryParams { public static final String DESCRIPTION = "Variant walker params"; - private String outdir; + + @DataField(description = "Output file name") private String outputFileName; - private String fileFormat; + @DataField(description = "Format that will be used as input for the variant walker") + private String inputFormat; + @DataField(description = "Docker image to use") private String dockerImage; + @DataField(description = "Command line to execute from the walker") private String commandLine; + @DataField(description = ParamConstants.INCLUDE_DESCRIPTION) private String include; + @DataField(description = ParamConstants.EXCLUDE_DESCRIPTION) private String exclude; - public String getOutdir() { - return outdir; - } - - public VariantWalkerParams setOutdir(String outdir) { - this.outdir = outdir; - return this; - } - public String getOutputFileName() { return outputFileName; } @@ -28,12 +28,12 @@ public VariantWalkerParams 
setOutputFileName(String outputFileName) { return this; } - public String getFileFormat() { - return fileFormat; + public String getInputFormat() { + return inputFormat; } - public VariantWalkerParams setFileFormat(String fileFormat) { - this.fileFormat = fileFormat; + public VariantWalkerParams setInputFormat(String inputFormat) { + this.inputFormat = inputFormat; return this; } diff --git a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/test/java/org/opencb/opencga/storage/hadoop/variant/walker/HadoopVariantWalkerTest.java b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/test/java/org/opencb/opencga/storage/hadoop/variant/walker/HadoopVariantWalkerTest.java index 02606a01fff..0d9ab975ed7 100644 --- a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/test/java/org/opencb/opencga/storage/hadoop/variant/walker/HadoopVariantWalkerTest.java +++ b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/test/java/org/opencb/opencga/storage/hadoop/variant/walker/HadoopVariantWalkerTest.java @@ -1,10 +1,7 @@ package org.opencb.opencga.storage.hadoop.variant.walker; -import org.junit.Before; -import org.junit.BeforeClass; -import org.junit.ClassRule; -import org.junit.Test; +import org.junit.*; import org.junit.experimental.categories.Category; import org.opencb.commons.datastore.core.ObjectMap; import org.opencb.commons.datastore.core.Query; @@ -34,7 +31,7 @@ public class HadoopVariantWalkerTest extends VariantStorageBaseTest implements H @ClassRule public static HadoopExternalResource externalResource = new HadoopExternalResource(); - private static String _dockerImage; + private static String dockerImage; @Before public void before() throws Exception { @@ -58,6 +55,14 @@ public static void beforeClass() throws Exception { ); VariantHbaseTestUtils.printVariants(variantStorageManager.getDBAdaptor(), newOutputUri()); + + dockerImage = buildDocker(); + } + + @AfterClass + public static void afterClass() throws Exception { + pruneDocker(dockerImage); + dockerImage = null; } @Test @@ -102,7 +107,6 @@ public void exportCommand() throws Exception { @Test public void exportDocker() throws Exception { URI outdir = newOutputUri(); - String dockerImage = buildDocker(); String cmdPython1 = "python variant_walker.py walker_example Cut --length 30"; @@ -110,9 +114,6 @@ public void exportDocker() throws Exception { } private static String buildDocker() throws IOException { - if (HadoopVariantWalkerTest._dockerImage != null) { - return HadoopVariantWalkerTest._dockerImage; - } String dockerImage = "local/variant-walker-test:latest"; Path dockerFile = Paths.get(getResourceUri("variantWalker/Dockerfile").getPath()); // Path pythonDir = Paths.get("../../opencga-storage-core/src/main/python").toAbsolutePath(); @@ -120,9 +121,14 @@ private static String buildDocker() throws IOException { Command dockerBuild = new Command(new String[]{"docker", "build", "-t", dockerImage, "-f", dockerFile.toString(), pythonDir.toString()}, Collections.emptyMap()); dockerBuild.run(); assertEquals(0, dockerBuild.getExitValue()); - HadoopVariantWalkerTest._dockerImage = dockerImage; return dockerImage; } - + private static void pruneDocker(String dockerImage) throws IOException { + if (dockerImage != null) { + Command dockerPrune = new Command(new String[]{"docker", "rmi", dockerImage}, Collections.emptyMap()); + dockerPrune.run(); + assertEquals(0, dockerPrune.getExitValue()); + } + } } From 48e15923b918fa11f3094d6720da1e2e524a9e0c Mon Sep 17 00:00:00 
2001 From: =?UTF-8?q?Jacobo=20Coll=20Morag=C3=B3n?= Date: Thu, 28 Nov 2024 17:11:09 +0000 Subject: [PATCH 47/66] storage: Fix NPE running SampleVariantStats #TASK-6722 --- .../variant/stats/SampleVariantStatsAnalysis.java | 12 ++++++++++-- .../analysis/variant/VariantAnalysisTest.java | 5 +++++ ...leVariantStatsHBaseMapReduceAnalysisExecutor.java | 5 ++++- .../hadoop/variant/mr/VariantMapReduceUtil.java | 4 +++- 4 files changed, 22 insertions(+), 4 deletions(-) diff --git a/opencga-analysis/src/main/java/org/opencb/opencga/analysis/variant/stats/SampleVariantStatsAnalysis.java b/opencga-analysis/src/main/java/org/opencb/opencga/analysis/variant/stats/SampleVariantStatsAnalysis.java index 3d91b877bdd..a12072e3a13 100644 --- a/opencga-analysis/src/main/java/org/opencb/opencga/analysis/variant/stats/SampleVariantStatsAnalysis.java +++ b/opencga-analysis/src/main/java/org/opencb/opencga/analysis/variant/stats/SampleVariantStatsAnalysis.java @@ -26,6 +26,7 @@ import org.opencb.commons.datastore.core.Query; import org.opencb.commons.datastore.core.QueryOptions; import org.opencb.opencga.analysis.tools.OpenCgaToolScopeStudy; +import org.opencb.opencga.analysis.variant.manager.VariantCatalogQueryUtils; import org.opencb.opencga.catalog.db.api.SampleDBAdaptor; import org.opencb.opencga.core.api.ParamConstants; import org.opencb.opencga.core.common.BatchUtils; @@ -62,6 +63,7 @@ public class SampleVariantStatsAnalysis extends OpenCgaToolScopeStudy { @ToolParams protected SampleVariantStatsAnalysisParams toolParams; private ArrayList checkedSamplesList; + private Query variantQuery; private SampleVariantStatsAnalysisExecutor toolExecutor; private List> batches; private int numBatches; @@ -165,6 +167,12 @@ protected void check() throws Exception { } } + + variantQuery = toolParams.getVariantQuery() == null ? new Query() : toolParams.getVariantQuery().toQuery(); + variantQuery.put(VariantQueryParam.STUDY.key(), study); + variantQuery = new VariantCatalogQueryUtils(catalogManager) + .parseQuery(variantQuery, new QueryOptions(), variantStorageManager.getCellBaseUtils(study, token), token); + checkedSamplesList = new ArrayList<>(allSamples); checkedSamplesList.sort(String::compareTo); if (checkedSamplesList.isEmpty()) { @@ -173,7 +181,7 @@ protected void check() throws Exception { } else { // check read permission variantStorageManager.checkQueryPermissions( - new Query() + new Query(variantQuery) .append(VariantQueryParam.STUDY.key(), study) .append(VariantQueryParam.INCLUDE_SAMPLE.key(), checkedSamplesList), new QueryOptions(), @@ -246,7 +254,7 @@ protected void run() throws ToolException { .setOutputFile(tmpOutputFile) .setStudy(study) .setSampleNames(batchSamples) - .setVariantQuery(toolParams.getVariantQuery() == null ? 
new Query() : toolParams.getVariantQuery().toQuery()) + .setVariantQuery(variantQuery) .execute(); if (tmpOutputFile != outputFile) { diff --git a/opencga-analysis/src/test/java/org/opencb/opencga/analysis/variant/VariantAnalysisTest.java b/opencga-analysis/src/test/java/org/opencb/opencga/analysis/variant/VariantAnalysisTest.java index f9e9392be80..8f3f9695a3c 100644 --- a/opencga-analysis/src/test/java/org/opencb/opencga/analysis/variant/VariantAnalysisTest.java +++ b/opencga-analysis/src/test/java/org/opencb/opencga/analysis/variant/VariantAnalysisTest.java @@ -429,6 +429,11 @@ public void testSampleStatsSampleFilter() throws Exception { new Query(VariantQueryParam.SAMPLE_DATA.key(), "DS>1;GT!=1|1")); } + @Test + public void testSampleStatsWithGeneFilter() throws Exception { + sampleVariantStats(null, "stats_BRCA1", false, 1, file.getSampleIds().subList(0, 2), false, new VariantQuery().gene("BRCA1")); + } + @Test public void testSampleStats() throws Exception { sampleVariantStats("1,2", "stats_1", false, 1, file.getSampleIds().subList(0, 2)); diff --git a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/analysis/stats/SampleVariantStatsHBaseMapReduceAnalysisExecutor.java b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/analysis/stats/SampleVariantStatsHBaseMapReduceAnalysisExecutor.java index 41b8ea016ed..8b4b0175356 100644 --- a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/analysis/stats/SampleVariantStatsHBaseMapReduceAnalysisExecutor.java +++ b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/analysis/stats/SampleVariantStatsHBaseMapReduceAnalysisExecutor.java @@ -1,11 +1,13 @@ package org.opencb.opencga.storage.hadoop.variant.analysis.stats; import org.opencb.commons.datastore.core.ObjectMap; +import org.opencb.commons.datastore.core.QueryOptions; import org.opencb.opencga.core.exceptions.ToolException; import org.opencb.opencga.core.exceptions.ToolExecutorException; import org.opencb.opencga.core.tools.annotations.ToolExecutor; import org.opencb.opencga.core.tools.variant.SampleVariantStatsAnalysisExecutor; import org.opencb.opencga.storage.core.variant.adaptors.VariantQueryException; +import org.opencb.opencga.storage.core.variant.query.ParsedVariantQuery; import org.opencb.opencga.storage.hadoop.variant.HadoopVariantStorageEngine; import org.opencb.opencga.storage.hadoop.variant.adaptors.VariantHadoopDBAdaptor; import org.opencb.opencga.storage.hadoop.variant.analysis.HadoopVariantStorageToolExecutor; @@ -43,8 +45,9 @@ public void run() throws ToolException { } } + ParsedVariantQuery variantQuery = engine.parseQuery(getVariantQuery(), new QueryOptions()); ObjectMap params = new ObjectMap(engine.getOptions()) - .appendAll(getVariantQuery()) + .appendAll(variantQuery.getQuery()) .append(SampleVariantStatsDriver.SAMPLES, sampleNames) .append(SampleVariantStatsDriver.OUTPUT, getOutputFile().toAbsolutePath().toUri()); engine.getMRExecutor().run(SampleVariantStatsDriver.class, SampleVariantStatsDriver.buildArgs( diff --git a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/mr/VariantMapReduceUtil.java 
b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/mr/VariantMapReduceUtil.java index 46b059e05c7..56ecaf7c87a 100644 --- a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/mr/VariantMapReduceUtil.java +++ b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/mr/VariantMapReduceUtil.java @@ -346,7 +346,9 @@ public static void initVariantRowMapperJobFromHBase(Job job, String variantTable job.setInputFormatClass(HBaseVariantRowTableInputFormat.class); job.getConfiguration().setBoolean(HBaseVariantRowTableInputFormat.MULTI_SCANS, scans.size() > 1); job.getConfiguration().setBoolean(HBaseVariantRowTableInputFormat.USE_SAMPLE_INDEX_TABLE_INPUT_FORMAT, useSampleIndex); - job.getConfiguration().set(HBaseVariantRowTableInputFormat.SAMPLE_INDEX_TABLE, sampleIndexTable); + if (sampleIndexTable != null) { + job.getConfiguration().set(HBaseVariantRowTableInputFormat.SAMPLE_INDEX_TABLE, sampleIndexTable); + } } public static void initVariantRowMapperJobFromPhoenix(Job job, VariantHadoopDBAdaptor dbAdaptor, From 1d86756f547bd4dc78ad7007006d4915ca16f696 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jacobo=20Coll=20Morag=C3=B3n?= Date: Fri, 29 Nov 2024 09:39:45 +0000 Subject: [PATCH 48/66] storage: Fix CustomPhoenixInputFormat generateSplit for first and last splits. #TASK-6722 --- .../variant/mr/CustomPhoenixInputFormat.java | 45 ++++++++++++++----- .../stats/SampleVariantStatsDriver.java | 9 ++-- 2 files changed, 40 insertions(+), 14 deletions(-) diff --git a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/mr/CustomPhoenixInputFormat.java b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/mr/CustomPhoenixInputFormat.java index b8e34933c95..30a1b0c6bc2 100644 --- a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/mr/CustomPhoenixInputFormat.java +++ b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/mr/CustomPhoenixInputFormat.java @@ -8,6 +8,7 @@ import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.hbase.client.Scan; import org.apache.hadoop.hbase.util.Bytes; +import org.apache.hadoop.hbase.util.Pair; import org.apache.hadoop.io.NullWritable; import org.apache.hadoop.mapreduce.*; import org.apache.hadoop.mapreduce.lib.db.DBWritable; @@ -23,6 +24,7 @@ import org.apache.phoenix.util.PhoenixRuntime; import org.opencb.opencga.storage.hadoop.HBaseCompat; import org.opencb.opencga.storage.hadoop.variant.HadoopVariantStorageOptions; +import org.opencb.opencga.storage.hadoop.variant.adaptors.phoenix.VariantPhoenixKeyFactory; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -70,7 +72,22 @@ public void initialize(InputSplit split, TaskAttemptContext context) throws IOEx super.initialize(split, context); if (split instanceof PhoenixInputSplit) { PhoenixInputSplit phoenixInputSplit = (PhoenixInputSplit) split; - logger.info("Key range : " + phoenixInputSplit.getKeyRange()); + KeyRange keyRange = phoenixInputSplit.getKeyRange(); + logger.info("Key range : " + keyRange); + + try { + Pair chrPosStart = VariantPhoenixKeyFactory.extractChrPosFromVariantRowKey(keyRange.getLowerRange()); + 
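// Decoding the range bounds makes the split boundaries readable as genomic coordinates in the logs. +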
Pair chrPosEnd = VariantPhoenixKeyFactory.extractChrPosFromVariantRowKey(keyRange.getUpperRange()); + logger.info("Variants key range : " + + (keyRange.isLowerInclusive() ? "[" : "(") + + chrPosStart.getFirst() + ":" + chrPosStart.getSecond() + + " - " + + chrPosEnd.getFirst() + ":" + chrPosEnd.getSecond() + + (keyRange.isUpperInclusive() ? "]" : ")")); + } catch (Exception e) { + logger.error("Error parsing key range: {}", e.getMessage()); + } + logger.info("Split: " + phoenixInputSplit.getScans().size() + " scans"); int i = 0; for (Scan scan : phoenixInputSplit.getScans()) { @@ -116,17 +133,23 @@ private List generateSplits(final QueryPlan qplan, final List splitScans = new ArrayList<>(numScans); Scan scan = scans.get(0); byte[] startRow = scan.getStartRow(); + if (startRow == null || startRow.length == 0) { + startRow = Bytes.toBytesBinary("1\\x00\\x00\\x00\\x00\\x00"); + logger.info("Scan with empty startRow. Set default start. " + + "[" + Bytes.toStringBinary(startRow) + "-" + Bytes.toStringBinary(scan.getStopRow()) + ")"); + } byte[] stopRow = scan.getStopRow(); - if (startRow != null && startRow.length != 0 && stopRow != null && stopRow.length != 0) { - byte[][] ranges = Bytes.split(startRow, stopRow, numScans - 1); - for (int i = 1; i < ranges.length; i++) { - Scan splitScan = new Scan(scan); - splitScan.withStartRow(ranges[i - 1]); - splitScan.withStopRow(ranges[i], false); - splitScans.add(splitScan); - } - } else { - splitScans.add(scan); + if (stopRow == null || stopRow.length == 0) { + stopRow = Bytes.toBytesBinary("Z\\x00\\x00\\x00\\x00\\x00"); + logger.info("Scan with empty stopRow. Set default stop. " + + "[" + Bytes.toStringBinary(startRow) + "-" + Bytes.toStringBinary(stopRow) + ")"); + } + byte[][] ranges = Bytes.split(startRow, stopRow, numScans - 1); + for (int i = 1; i < ranges.length; i++) { + Scan splitScan = new Scan(scan); + splitScan.withStartRow(ranges[i - 1]); + splitScan.withStopRow(ranges[i], false); + splitScans.add(splitScan); } for (Scan splitScan : splitScans) { psplits.add(new PhoenixInputSplit(Collections.singletonList(splitScan))); diff --git a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/stats/SampleVariantStatsDriver.java b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/stats/SampleVariantStatsDriver.java index 7ff4049afce..1c55569947f 100644 --- a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/stats/SampleVariantStatsDriver.java +++ b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/stats/SampleVariantStatsDriver.java @@ -67,6 +67,7 @@ public class SampleVariantStatsDriver extends VariantTableAggregationDriver { private String trios; private String fileData; private String sampleData; + private Set includeSample; @Override protected Map getParams() { @@ -91,7 +92,7 @@ protected void parseAndValidateParameters() throws IOException { List samples = Arrays.asList(samplesStr.split(",")); StringBuilder trios = new StringBuilder(); - Set includeSample = new LinkedHashSet<>(); + includeSample = new LinkedHashSet<>(); if (samples.size() == 1 && (samples.get(0).equals("auto") || samples.get(0).equals("all"))) { boolean all = samples.get(0).equals("all"); metadataManager.sampleMetadataIterator(studyId).forEachRemaining(sampleMetadata -> { @@ -101,16 +102,18 @@ 
protected void parseAndValidateParameters() throws IOException { } } }); + sampleIds = new ArrayList<>(includeSample); } else { + sampleIds = new ArrayList<>(samples.size()); for (String sample : samples) { Integer sampleId = metadataManager.getSampleId(studyId, sample); if (sampleId == null) { throw VariantQueryException.sampleNotFound(sample, metadataManager.getStudyName(studyId)); } + sampleIds.add(sampleId); addTrio(trios, includeSample, metadataManager.getSampleMetadata(studyId, sampleId)); } } - sampleIds = new ArrayList<>(includeSample); if (sampleIds.isEmpty()) { throw new IllegalArgumentException("Nothing to do!"); } @@ -172,7 +175,7 @@ private static Pedigree readPedigree(Configuration conf) { protected Query getQuery() { Query query = super.getQuery() .append(VariantQueryParam.STUDY.key(), getStudyId()) - .append(VariantQueryParam.INCLUDE_SAMPLE.key(), sampleIds); + .append(VariantQueryParam.INCLUDE_SAMPLE.key(), includeSample); query.remove(VariantQueryParam.SAMPLE_DATA.key()); query.remove(VariantQueryParam.FILE_DATA.key()); return query; From 5141031b098e61a5aa7d9b67d43faa941517c995 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jacobo=20Coll=20Morag=C3=B3n?= Date: Fri, 29 Nov 2024 09:40:13 +0000 Subject: [PATCH 49/66] analysis: Fix NPE at relatedness tool. #TASK-6722 --- .../opencga/analysis/family/qc/IBDComputation.java | 9 +++++---- .../wrappers/plink/PlinkWrapperAnalysisExecutor.java | 11 +++++++++++ 2 files changed, 16 insertions(+), 4 deletions(-) diff --git a/opencga-analysis/src/main/java/org/opencb/opencga/analysis/family/qc/IBDComputation.java b/opencga-analysis/src/main/java/org/opencb/opencga/analysis/family/qc/IBDComputation.java index 7277c3d2429..7488510def0 100644 --- a/opencga-analysis/src/main/java/org/opencb/opencga/analysis/family/qc/IBDComputation.java +++ b/opencga-analysis/src/main/java/org/opencb/opencga/analysis/family/qc/IBDComputation.java @@ -32,6 +32,7 @@ import org.opencb.opencga.analysis.variant.relatedness.RelatednessAnalysis; import org.opencb.opencga.analysis.wrappers.plink.PlinkWrapperAnalysisExecutor; import org.opencb.opencga.catalog.exceptions.CatalogException; +import org.opencb.opencga.core.config.Analysis; import org.opencb.opencga.core.exceptions.ToolException; import org.opencb.opencga.core.models.family.Family; import org.opencb.opencga.core.models.individual.Individual; @@ -117,7 +118,7 @@ public static RelatednessReport compute(String study, Family family, List> inputBindings = new ArrayList<>(); inputBindings.add(new AbstractMap.SimpleEntry<>(freqPath.getParent().toString(), "/input")); @@ -311,8 +312,8 @@ private static File runIBD(String basename, Path freqPath, Path outDir) throws T String plinkParams = "plink1.9 --tfile /output/" + basename + " --genome rel-check --read-freq /input/" + FREQ_FILENAME + " --out /output/" + basename; try { - PlinkWrapperAnalysisExecutor plinkExecutor = new PlinkWrapperAnalysisExecutor(); - DockerUtils.run(plinkExecutor.getDockerImageName(), inputBindings, outputBinding, plinkParams, null); + String dockerImageName = PlinkWrapperAnalysisExecutor.getDockerImageName(analysisConf); + DockerUtils.run(dockerImageName, inputBindings, outputBinding, plinkParams, null); } catch (IOException e) { throw new ToolException(e); } diff --git a/opencga-analysis/src/main/java/org/opencb/opencga/analysis/wrappers/plink/PlinkWrapperAnalysisExecutor.java b/opencga-analysis/src/main/java/org/opencb/opencga/analysis/wrappers/plink/PlinkWrapperAnalysisExecutor.java index 890475b40fc..ffe17a02a72 100644 --- 
a/opencga-analysis/src/main/java/org/opencb/opencga/analysis/wrappers/plink/PlinkWrapperAnalysisExecutor.java +++ b/opencga-analysis/src/main/java/org/opencb/opencga/analysis/wrappers/plink/PlinkWrapperAnalysisExecutor.java @@ -3,6 +3,8 @@ import org.apache.commons.lang3.tuple.ImmutablePair; import org.apache.commons.lang3.tuple.Pair; import org.opencb.opencga.analysis.wrappers.executors.DockerWrapperAnalysisExecutor; +import org.opencb.opencga.core.config.Analysis; +import org.opencb.opencga.core.exceptions.ToolException; import org.opencb.opencga.core.tools.annotations.ToolExecutor; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -21,6 +23,15 @@ public class PlinkWrapperAnalysisExecutor extends DockerWrapperAnalysisExecutor private Logger logger = LoggerFactory.getLogger(this.getClass()); + public static String getDockerImageName(Analysis analysisConf) throws ToolException { + return analysisConf.getOpencgaExtTools().split(":")[0]; + } + + @Override + public String getDockerImageName() throws ToolException { + return getDockerImageName(getConfiguration().getAnalysis()); + } + @Override protected void run() throws Exception { StringBuilder sb = initCommandLine(); From f2bc782de721f08a0c45c40b2c72c5fd9972e08f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jacobo=20Coll=20Morag=C3=B3n?= Date: Fri, 29 Nov 2024 10:14:40 +0000 Subject: [PATCH 50/66] cicd: Upload tests logs as artifacts. Reduce action log size. #TASK-6722 --- .github/workflows/test-analysis.yml | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/.github/workflows/test-analysis.yml b/.github/workflows/test-analysis.yml index 554e1f72520..76437255b57 100644 --- a/.github/workflows/test-analysis.yml +++ b/.github/workflows/test-analysis.yml @@ -87,10 +87,24 @@ jobs: with: mongodb-version: 6.0 mongodb-replica-set: rs-test - - name: Maven build + - name: Maven build (skip tests) run: mvn -B clean install -DskipTests -P ${{ inputs.hadoop }} -Dcheckstyle.skip ${{ inputs.mvn_opts }} + - name: Build Junit log file name + id: BuildJunitLogFileName + run: | + MODULE=$(basename ${{ inputs.module }}) + if [[ -z "$MODULE" ]]; then + MODULE="opencga" + fi + TAGS=$(echo ${{ inputs.test_profile }} | sed -e 's/run\([^,]*\)Tests/\1/g' | tr ',' '_' | tr '[:upper:]' '[:lower:]' ) + echo "TESTS_LOG_FILE_NAME=junit_${{ inputs.hadoop }}_${TAGS}_${MODULE}.log" >> $GITHUB_OUTPUT - name: Run Junit tests - run: mvn -B verify surefire-report:report --fail-never -Dsurefire.testFailureIgnore=true -f ${{ (inputs.module == '' || inputs.module == 'all') && '.' || inputs.module }} -P ${{ inputs.hadoop }},${{ inputs.test_profile }} -Dcheckstyle.skip ${{ inputs.mvn_opts }} + run: mvn -B verify surefire-report:report --fail-never -Dsurefire.testFailureIgnore=true -f ${{ (inputs.module == '' || inputs.module == 'all') && '.' 
|| inputs.module }} -P ${{ inputs.hadoop }},${{ inputs.test_profile }} -Dcheckstyle.skip ${{ inputs.mvn_opts }} |& tee ${{ steps.BuildJunitLogFileName.outputs.TESTS_LOG_FILE_NAME }} |& grep -P '^\[[^\]]*(INFO|WARNING|ERROR)' --colour=never --line-buffered + - name: Upload Junit test logs + uses: actions/upload-artifact@v4 + with: + name: ${{ steps.BuildJunitLogFileName.outputs.TESTS_LOG_FILE_NAME }} + path: ${{ steps.BuildJunitLogFileName.outputs.TESTS_LOG_FILE_NAME }} - name: Publish Test Report on GitHub uses: scacap/action-surefire-report@v1 env: From dd684aafdb433acb000367be8451f1b12bea9054 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jacobo=20Coll=20Morag=C3=B3n?= Date: Fri, 29 Nov 2024 10:19:45 +0000 Subject: [PATCH 51/66] storage: Fix NPE at CohortVariantStatsDriver. #TASK-6722 --- .../hadoop/variant/stats/CohortVariantStatsDriver.java | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/stats/CohortVariantStatsDriver.java b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/stats/CohortVariantStatsDriver.java index b633508455f..b18e7cfa5ec 100644 --- a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/stats/CohortVariantStatsDriver.java +++ b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/stats/CohortVariantStatsDriver.java @@ -316,8 +316,14 @@ protected void map(Object key, VariantRow row, Context context) throws IOExcepti if (fileIds.contains(fileColumn.getFileId())) { if (fileColumn.getOverlappingStatus().equals(VariantOverlappingStatus.NONE)) { HashMap attributes = new HashMap<>(2); - attributes.put(StudyEntry.QUAL, fileColumn.getQualString()); - attributes.put(StudyEntry.FILTER, fileColumn.getFilter()); + String qualString = fileColumn.getQualString(); + if (qualString != null) { + attributes.put(StudyEntry.QUAL, qualString); + } + String filter = fileColumn.getFilter(); + if (filter != null) { + attributes.put(StudyEntry.FILTER, filter); + } entries.add(new FileEntry(String.valueOf(fileColumn.getFileId()), fileColumn.getCall(), attributes)); } } From 9795c6a67406614012049b44c5de0712124d6d1d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jacobo=20Coll=20Morag=C3=B3n?= Date: Fri, 29 Nov 2024 10:47:44 +0000 Subject: [PATCH 52/66] cicd: Fix empty module name when building the Junit log file name.
#TASK-6722 --- .github/workflows/test-analysis.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/test-analysis.yml b/.github/workflows/test-analysis.yml index 76437255b57..1962c89ddc7 100644 --- a/.github/workflows/test-analysis.yml +++ b/.github/workflows/test-analysis.yml @@ -92,7 +92,7 @@ jobs: - name: Build Junit log file name id: BuildJunitLogFileName run: | - MODULE=$(basename ${{ inputs.module }}) + MODULE=$(basename ${{ (inputs.module == '' || inputs.module == 'all') && 'opencga' || inputs.module }} ) if [[ -z "$MODULE" ]]; then MODULE="opencga" fi From 923651cb3e6b91266e51c036276f81c469734649 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jacobo=20Coll=20Morag=C3=B3n?= Date: Fri, 29 Nov 2024 10:52:57 +0000 Subject: [PATCH 53/66] storage: Fix AIOOBE SampleVariantStatsDriver #TASK-6722 --- .../stats/SampleVariantStatsDriver.java | 20 +++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/stats/SampleVariantStatsDriver.java b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/stats/SampleVariantStatsDriver.java index 1c55569947f..bdf04ff741e 100644 --- a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/stats/SampleVariantStatsDriver.java +++ b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/stats/SampleVariantStatsDriver.java @@ -63,11 +63,15 @@ public class SampleVariantStatsDriver extends VariantTableAggregationDriver { private static final String STATS_OPERATION_NAME = "sample_stats"; private static final String FIXED_FORMAT = "FIXED_FORMAT"; private static final String FIXED_FILE_ATTRIBUTES = "FIXED_FILE_ATTRIBUTES"; + // List of sampleIds to calculate stats private List sampleIds; + // List of sampleIds to include in the query needed to calculate stats. 
Might include parents + private Set includeSample; private String trios; private String fileData; private String sampleData; - private Set includeSample; + public static final String SAMPLE_IDS = "SampleVariantStatsDriver.sample_ids"; + public static final String INCLUDE_SAMPLE_IDS = "SampleVariantStatsDriver.include_sample_ids"; @Override protected Map getParams() { @@ -244,7 +248,8 @@ protected Job setupJob(Job job, String archiveTable, String variantTable) throws List fixedFormat = HBaseToVariantConverter.getFixedFormat(studyMetadata); List fileAttributes = HBaseToVariantConverter.getFixedAttributes(studyMetadata); - job.getConfiguration().set(SAMPLES, sampleIds.stream().map(Objects::toString).collect(Collectors.joining(","))); + job.getConfiguration().set(SAMPLE_IDS, sampleIds.stream().map(Objects::toString).collect(Collectors.joining(","))); + job.getConfiguration().set(INCLUDE_SAMPLE_IDS, includeSample.stream().map(Objects::toString).collect(Collectors.joining(","))); job.getConfiguration().setStrings(FIXED_FORMAT, fixedFormat.toArray(new String[0])); job.getConfiguration().setStrings(FIXED_FILE_ATTRIBUTES, fileAttributes.toArray(new String[0])); if (StringUtils.isNotEmpty(fileData)) { @@ -364,10 +369,12 @@ public List getWritables() { } } + public static class SampleVariantStatsMapper extends VariantRowMapper { private int studyId; private int[] samples; + private int[] includeSamples; protected final Logger logger = LoggerFactory.getLogger(SampleVariantStatsMapper.class); private VariantStorageMetadataManager vsm; @@ -384,8 +391,9 @@ public static class SampleVariantStatsMapper extends VariantRowMapper Date: Fri, 29 Nov 2024 11:00:57 +0000 Subject: [PATCH 54/66] storage: Do not produce a .crc checksum file copying from hdfs. #TASK-6722 --- .../opencga/storage/hadoop/utils/MapReduceOutputFile.java | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/utils/MapReduceOutputFile.java b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/utils/MapReduceOutputFile.java index e80a2d07da6..b1a7e19397b 100644 --- a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/utils/MapReduceOutputFile.java +++ b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/utils/MapReduceOutputFile.java @@ -360,7 +360,9 @@ protected List concatMrOutputToLocal(Path mrOutdir, Path localOutput, bool LOGGER.info(" Target {}: {}", getCompression(localOutput.getName()), localOutput.toUri()); LOGGER.info(" ---- "); - try (OutputStream os = getOutputStreamPlain(localOutput.getName(), localOutput.getFileSystem(getConf()).create(localOutput))) { + FileSystem localFfileSystem = localOutput.getFileSystem(getConf()); + localFfileSystem.setWriteChecksum(false); + try (OutputStream os = getOutputStreamPlain(localOutput.getName(), localFfileSystem.create(localOutput))) { for (int i = 0; i < paths.size(); i++) { Path partFile = paths.get(i); long partFileSize = fileSystem.getFileStatus(partFile).getLen(); From 9f326d90c012140016e5e3f712b387cacd06732e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jacobo=20Coll=20Morag=C3=B3n?= Date: Fri, 29 Nov 2024 11:34:19 +0000 Subject: [PATCH 55/66] storage: Improve docker process failure. Do not close the stdin twice. 
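The mapper streams variants through an external process, and closing stdin is what signals EOF to that process, so it must happen exactly once and only while the process is still alive. A minimal standalone sketch of the close-once pattern applied below; class and field names here are illustrative, not taken from the codebase:

    import java.io.IOException;
    import java.io.OutputStream;

    class StdinCloser {
        private OutputStream stdin; // pipe into the external process
        private Process process;    // the child process being fed

        void closeStdinOnce() throws IOException {
            try {
                // Closing stdin delivers EOF; skip the close if the child has
                // already died, since its end of the pipe may be gone.
                if (stdin != null && process.isAlive()) {
                    stdin.close();
                }
            } catch (IOException e) {
                // A racing child exit can surface as "Stream closed"; only
                // that specific case is safe to ignore.
                if (!"Stream closed".equals(e.getMessage())) {
                    throw e;
                }
            } finally {
                stdin = null; // guarantee no second close attempt
            }
        }
    }

Nulling the field in the finally block is the key design choice: even if the close fails, a later cleanup pass can never close the same stream twice.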
#TASK-6722 --- .../variant/mr/StreamVariantMapper.java | 34 ++++++++++++++++--- 1 file changed, 29 insertions(+), 5 deletions(-) diff --git a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/mr/StreamVariantMapper.java b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/mr/StreamVariantMapper.java index 95d0e0fb8cd..081f512a6d7 100644 --- a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/mr/StreamVariantMapper.java +++ b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/mr/StreamVariantMapper.java @@ -308,13 +308,32 @@ private void closeProcess(Context context, boolean closeOutputs) throws IOExcept variantDataWriter.post(); variantDataWriter.close(); } + } catch (Throwable th) { + addException(th); + } finally { + variantDataWriter = null; + } - // Close stdin to the process. This will cause the process to finish. - if (stdin != null) { + try { + // Close the stream to the process + // This will cause the process to finish + // (if the process is reading from stdin, it will receive EOF) + // If the process has already finished, the stdin.close() will throw an exception + if (stdin != null && process.isAlive()) { stdin.close(); - stdin = null; } + } catch (Throwable th) { + if (th instanceof IOException && "Stream closed".equals(th.getMessage())) { + // Ignore "Stream closed" exception + } else { + addException(th); + } + } finally { + // Clear stdin even if it fails to avoid closing it twice + stdin = null; + } + try { if (process != null) { // Wait for the process to finish int exitVal = process.waitFor(); @@ -333,19 +352,24 @@ private void closeProcess(Context context, boolean closeOutputs) throws IOExcept if (stdout != null) { stdoutThread.join(); stdout.close(); - stdout = null; } } catch (Throwable th) { addException(th); + } finally { + // Clear stdout even if it fails to avoid closing it twice + stdout = null; } + try { if (stderr != null) { stderrThread.join(); stderr.close(); - stderr = null; } } catch (Throwable th) { addException(th); + } finally { + // Clear stderr even if it fails to avoid closing it twice + stderr = null; } try { From 627e56a9e1c8f4f4d9eeffca670cdbc4ff2c0886 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jacobo=20Coll=20Morag=C3=B3n?= Date: Fri, 29 Nov 2024 11:54:59 +0000 Subject: [PATCH 56/66] storage: Fix AIOOBE SampleVariantStatsDriver #TASK-6722 --- .../variant/mr/CustomPhoenixInputFormat.java | 19 ++++++--- .../stats/SampleVariantStatsDriver.java | 39 ++++++++++++------- 2 files changed, 38 insertions(+), 20 deletions(-) diff --git a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/mr/CustomPhoenixInputFormat.java b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/mr/CustomPhoenixInputFormat.java index 30a1b0c6bc2..b1b37eba8f7 100644 --- a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/mr/CustomPhoenixInputFormat.java +++ b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/mr/CustomPhoenixInputFormat.java @@ -125,26 +125,27 @@ private List generateSplits(final QueryPlan qplan, 
final List psplits = Lists.newArrayListWithExpectedSize(splits.size()); + int undividedSplits = 0; + int numScanSplit = configuration.getInt(HadoopVariantStorageOptions.MR_HBASE_PHOENIX_SCAN_SPLIT.key(), + HadoopVariantStorageOptions.MR_HBASE_PHOENIX_SCAN_SPLIT.defaultValue()); for (List scans : qplan.getScans()) { if (scans.size() == 1) { // Split scans into multiple smaller scans - int numScans = configuration.getInt(HadoopVariantStorageOptions.MR_HBASE_PHOENIX_SCAN_SPLIT.key(), - HadoopVariantStorageOptions.MR_HBASE_PHOENIX_SCAN_SPLIT.defaultValue()); - List splitScans = new ArrayList<>(numScans); + List splitScans = new ArrayList<>(numScanSplit); Scan scan = scans.get(0); byte[] startRow = scan.getStartRow(); if (startRow == null || startRow.length == 0) { startRow = Bytes.toBytesBinary("1\\x00\\x00\\x00\\x00\\x00"); logger.info("Scan with empty startRow. Set default start. " - + "[" + Bytes.toStringBinary(startRow) + "-" + Bytes.toStringBinary(scan.getStopRow()) + ")"); + + "[" + Bytes.toStringBinary(startRow) + " - " + Bytes.toStringBinary(scan.getStopRow()) + ")"); } byte[] stopRow = scan.getStopRow(); if (stopRow == null || stopRow.length == 0) { stopRow = Bytes.toBytesBinary("Z\\x00\\x00\\x00\\x00\\x00"); logger.info("Scan with empty stopRow. Set default stop. " - + "[" + Bytes.toStringBinary(startRow) + "-" + Bytes.toStringBinary(stopRow) + ")"); + + "[" + Bytes.toStringBinary(startRow) + " - " + Bytes.toStringBinary(stopRow) + ")"); } - byte[][] ranges = Bytes.split(startRow, stopRow, numScans - 1); + byte[][] ranges = Bytes.split(startRow, stopRow, numScanSplit - 1); for (int i = 1; i < ranges.length; i++) { Scan splitScan = new Scan(scan); splitScan.withStartRow(ranges[i - 1]); @@ -156,8 +157,14 @@ private List generateSplits(final QueryPlan qplan, final List 0) { + logger.info("There are " + undividedSplits + " splits that were not subdivided."); + } return psplits; } diff --git a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/stats/SampleVariantStatsDriver.java b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/stats/SampleVariantStatsDriver.java index bdf04ff741e..4c68b6f9b64 100644 --- a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/stats/SampleVariantStatsDriver.java +++ b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/stats/SampleVariantStatsDriver.java @@ -381,7 +381,7 @@ public static class SampleVariantStatsMapper extends VariantRowMapper> fileToSampleIds = new HashMap<>(); private DistributedSampleVariantStatsCalculator calculator; private final HBaseToVariantAnnotationConverter annotationConverter = new HBaseToVariantAnnotationConverter(); - private int[] sampleIdsPosition; + private Map sampleIdsPosition; private int sampleDataDpIdx; private int fileDataDpIdx; private Predicate fileDataFilter; @@ -393,7 +393,6 @@ protected void setup(Context context) throws IOException, InterruptedException { studyId = context.getConfiguration().getInt(STUDY_ID, -1); samples = context.getConfiguration().getInts(SAMPLE_IDS); includeSamples = context.getConfiguration().getInts(INCLUDE_SAMPLE_IDS); - sampleIdsPosition = new int[IntStream.of(includeSamples).max().orElse(0) + 1]; String fileDataQuery = context.getConfiguration().get(VariantQueryParam.FILE_DATA.key()); String 
sampleDataQuery = context.getConfiguration().get(VariantQueryParam.SAMPLE_DATA.key()); @@ -404,9 +403,9 @@ protected void setup(Context context) throws IOException, InterruptedException { sampleDataDpIdx = fixedFormat.indexOf(VCFConstants.DEPTH_KEY); fileDataDpIdx = fileAttributes.indexOf(VCFConstants.DEPTH_KEY); - Arrays.fill(sampleIdsPosition, -1); + sampleIdsPosition = new HashMap<>(includeSamples.length); for (int i = 0; i < includeSamples.length; i++) { - sampleIdsPosition[includeSamples[i]] = i; + sampleIdsPosition.put(includeSamples[i], i); } Pedigree pedigree = readPedigree(context.getConfiguration()); @@ -424,11 +423,20 @@ private List getSamplesFromFileId(int fileId) { id -> { ArrayList sampleIds = new ArrayList<>(vsm.getFileMetadata(studyId, id).getSamples()); // Discard unused samples - sampleIds.removeIf(s -> sampleIdsPosition.length <= s || sampleIdsPosition[s] < 0); + sampleIds.removeIf(s -> !sampleIdsPosition.containsKey(s)); return sampleIds; }); } + private int getSamplePosition(Integer sampleId) { + Integer samplePosition = sampleIdsPosition.get(sampleId); + if (samplePosition == null) { + throw new IllegalStateException("Sample " + sampleId + " not found in includeSamples " + + Arrays.toString(includeSamples)); + } + return samplePosition; + } + @Override protected void map(Object key, VariantRow row, Context context) throws IOException, InterruptedException { VariantAnnotation[] annotation = new VariantAnnotation[1]; @@ -443,9 +451,10 @@ protected void map(Object key, VariantRow row, Context context) throws IOExcepti Variant variant = row.walker().onSample(sampleCell -> { int sampleId = sampleCell.getSampleId(); + int samplePosition = getSamplePosition(sampleId); if (!sampleDataFilter.test(sampleCell)) { // Invalidate sample - invalidSamples[sampleIdsPosition[sampleId]] = true; + invalidSamples[samplePosition] = true; return; } @@ -454,19 +463,19 @@ protected void map(Object key, VariantRow row, Context context) throws IOExcepti if (gt == null || gt.isEmpty()) { // This is a really weird situation, most likely due to errors in the input files logger.error("Empty genotype at sample " + sampleId + " in variant " + row.getVariant()); - gts.set(sampleIdsPosition[sampleId], GenotypeClass.NA_GT_VALUE); + gts.set(samplePosition, GenotypeClass.NA_GT_VALUE); } else if (gt.equals(GenotypeClass.UNKNOWN_GENOTYPE)) { // skip unknown genotypes context.getCounter(COUNTER_GROUP_NAME, "unknownGt").increment(1); } else { - gts.set(sampleIdsPosition[sampleId], gt); + gts.set(samplePosition, gt); } if (sampleDataDpIdx > 0) { String dp = sampleCell.getSampleData(sampleDataDpIdx); // Do not set invalid values if (StringUtils.isNumeric(dp)) { - dps.set(sampleIdsPosition[sampleId], dp); + dps.set(samplePosition, dp); } } }).onFile(fileCell -> { @@ -474,15 +483,16 @@ protected void map(Object key, VariantRow row, Context context) throws IOExcepti if (fileDataFilter.test(fileCell)) { for (Integer sampleId : getSamplesFromFileId(fileId)) { - filters.set(sampleIdsPosition[sampleId], fileCell.getFilter()); - quals.set(sampleIdsPosition[sampleId], fileCell.getQualString()); + int samplePosition = getSamplePosition(sampleId); + filters.set(samplePosition, fileCell.getFilter()); + quals.set(samplePosition, fileCell.getQualString()); if (fileDataDpIdx > 0) { String dp = fileCell.getFileData(fileDataDpIdx); // Do not set invalid values if (StringUtils.isNumeric(dp)) { // Prioritize DP value from FORMAT. Do not overwrite if present. 
- if (StringUtils.isEmpty(dps.get(sampleIdsPosition[sampleId]))) { - dps.set(sampleIdsPosition[sampleId], dp); + if (StringUtils.isEmpty(dps.get(samplePosition))) { + dps.set(samplePosition, dp); } } } @@ -490,7 +500,8 @@ protected void map(Object key, VariantRow row, Context context) throws IOExcepti } else { // Invalidate samples from this file for (Integer sampleId : getSamplesFromFileId(fileId)) { - invalidSamples[sampleIdsPosition[sampleId]] = true; + int samplePosition = getSamplePosition(sampleId); + invalidSamples[samplePosition] = true; } } }).onVariantAnnotation(variantAnnotationColumn -> { From 98ce6f8a333744ee2da22bdb7b3297a0ab91e704 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jacobo=20Coll=20Morag=C3=B3n?= Date: Fri, 29 Nov 2024 12:07:00 +0000 Subject: [PATCH 57/66] storage: Do not produce a .crc checksum file copying from hdfs. #TASK-6722 --- .../opencga/storage/hadoop/utils/MapReduceOutputFile.java | 7 ++++--- .../storage/hadoop/variant/mr/StreamVariantMapper.java | 4 ++-- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/utils/MapReduceOutputFile.java b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/utils/MapReduceOutputFile.java index b1a7e19397b..64b7223807b 100644 --- a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/utils/MapReduceOutputFile.java +++ b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/utils/MapReduceOutputFile.java @@ -344,6 +344,9 @@ protected List concatMrOutputToLocal(Path mrOutdir, Path localOutput, bool } } } + FileSystem localFileSystem = localOutput.getFileSystem(getConf()); + localFileSystem.setWriteChecksum(false); + StopWatch stopWatch = new StopWatch(); stopWatch.start(); if (paths.isEmpty()) { @@ -360,9 +363,7 @@ protected List concatMrOutputToLocal(Path mrOutdir, Path localOutput, bool LOGGER.info(" Target {}: {}", getCompression(localOutput.getName()), localOutput.toUri()); LOGGER.info(" ---- "); - FileSystem localFfileSystem = localOutput.getFileSystem(getConf()); - localFfileSystem.setWriteChecksum(false); - try (OutputStream os = getOutputStreamPlain(localOutput.getName(), localFfileSystem.create(localOutput))) { + try (OutputStream os = getOutputStreamPlain(localOutput.getName(), localFileSystem.create(localOutput))) { for (int i = 0; i < paths.size(); i++) { Path partFile = paths.get(i); long partFileSize = fileSystem.getFileStatus(partFile).getLen(); diff --git a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/mr/StreamVariantMapper.java b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/mr/StreamVariantMapper.java index 081f512a6d7..89758279e05 100644 --- a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/mr/StreamVariantMapper.java +++ b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/mr/StreamVariantMapper.java @@ -248,8 +248,8 @@ private void throwExceptionIfAny() throws IOException { if (hasExceptions()) { String message = "StreamVariantMapper failed:"; if (stderrThread != null) { - String stderr = String.join("\n", 
stderrThread.stderrBuffer); - message += "\nSTDERR: " + stderr; + String stderr = String.join("\n[STDERR] - ", stderrThread.stderrBuffer); + message += "\n[STDERR] - " + stderr; } if (throwables.size() == 1) { Throwable cause = throwables.get(0); From 14c07d90c63da3a324a06673fbdecec27ee6fa20 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jacobo=20Coll=20Morag=C3=B3n?= Date: Fri, 29 Nov 2024 12:27:55 +0000 Subject: [PATCH 58/66] analysis: Do not use the scratchDir as intermediate folder for export and walk. #TASK-6722 --- .../analysis/variant/VariantExportTool.java | 23 +++-------------- .../analysis/variant/VariantWalkerTool.java | 25 +++---------------- .../VariantInternalCommandExecutor.java | 2 +- .../models/variant/VariantExportParams.java | 15 ++++++----- 4 files changed, 16 insertions(+), 49 deletions(-) diff --git a/opencga-analysis/src/main/java/org/opencb/opencga/analysis/variant/VariantExportTool.java b/opencga-analysis/src/main/java/org/opencb/opencga/analysis/variant/VariantExportTool.java index b0a2005ac18..b1c49aeafa6 100644 --- a/opencga-analysis/src/main/java/org/opencb/opencga/analysis/variant/VariantExportTool.java +++ b/opencga-analysis/src/main/java/org/opencb/opencga/analysis/variant/VariantExportTool.java @@ -20,8 +20,6 @@ import org.opencb.commons.datastore.core.Query; import org.opencb.commons.datastore.core.QueryOptions; import org.opencb.opencga.analysis.tools.OpenCgaTool; -import org.opencb.opencga.catalog.io.IOManager; -import org.opencb.opencga.core.common.UriUtils; import org.opencb.opencga.core.models.common.Enums; import org.opencb.opencga.core.models.variant.VariantExportParams; import org.opencb.opencga.core.tools.annotations.Tool; @@ -29,9 +27,7 @@ import org.opencb.opencga.storage.core.variant.adaptors.VariantQueryParam; import org.opencb.opencga.storage.core.variant.io.VariantWriterFactory; -import java.net.URI; import java.nio.file.Path; -import java.util.ArrayList; import java.util.Arrays; import java.util.List; @@ -70,12 +66,8 @@ protected List getSteps() { @Override protected void run() throws Exception { - List uris = new ArrayList<>(2); step(ID, () -> { - // Use scratch directory to store intermediate files. Move files to final directory at the end - // The scratch directory is expected to be faster than the final directory - // This also avoids moving files to final directory if the tool fails - Path outDir = getScratchDir(); + Path outDir = getOutDir(); String outputFile = StringUtils.isEmpty(toolParams.getOutputFileName()) ? 
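With the intermediate copy removed, both tools write straight into the tool output directory and the whole move-files step disappears. A condensed sketch of what the export step reduces to, reusing the helper names visible in the diff below (the fallback for an empty output file name is elided):

    step(ID, () -> {
        Path outDir = getOutDir(); // final directory; nothing to move afterwards
        String outputFile = outDir.resolve(toolParams.getOutputFileName()).toString();
        variantStorageManager.exportData(outputFile, outputFormat,
                toolParams.getVariantsFile(), query, queryOptions, token);
    });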
outDir.toString() : outDir.resolve(toolParams.getOutputFileName()).toString(); @@ -84,18 +76,9 @@ protected void run() throws Exception { for (VariantQueryParam param : VariantQueryParam.values()) { queryOptions.remove(param.key()); } - uris.addAll(variantStorageManager.exportData(outputFile, + variantStorageManager.exportData(outputFile, outputFormat, - toolParams.getVariantsFile(), query, queryOptions, token)); - }); - step("move-files", () -> { - // Move files to final directory - IOManager ioManager = catalogManager.getIoManagerFactory().get(uris.get(0)); - for (URI uri : uris) { - String fileName = UriUtils.fileName(uri); - logger.info("Moving file -- " + fileName); - ioManager.move(uri, getOutDir().resolve(fileName).toUri()); - } + toolParams.getVariantsFile(), query, queryOptions, token); }); } } diff --git a/opencga-analysis/src/main/java/org/opencb/opencga/analysis/variant/VariantWalkerTool.java b/opencga-analysis/src/main/java/org/opencb/opencga/analysis/variant/VariantWalkerTool.java index 5ab80d4f57e..109a9d4b5a7 100644 --- a/opencga-analysis/src/main/java/org/opencb/opencga/analysis/variant/VariantWalkerTool.java +++ b/opencga-analysis/src/main/java/org/opencb/opencga/analysis/variant/VariantWalkerTool.java @@ -20,17 +20,13 @@ import org.opencb.commons.datastore.core.Query; import org.opencb.commons.datastore.core.QueryOptions; import org.opencb.opencga.analysis.tools.OpenCgaTool; -import org.opencb.opencga.catalog.io.IOManager; -import org.opencb.opencga.core.common.UriUtils; import org.opencb.opencga.core.models.common.Enums; import org.opencb.opencga.core.models.variant.VariantWalkerParams; import org.opencb.opencga.core.tools.annotations.Tool; import org.opencb.opencga.core.tools.annotations.ToolParams; import org.opencb.opencga.storage.core.variant.io.VariantWriterFactory; -import java.net.URI; import java.nio.file.Path; -import java.util.ArrayList; import java.util.Arrays; import java.util.List; @@ -75,29 +71,14 @@ protected List getSteps() { @Override protected void run() throws Exception { - List uris = new ArrayList<>(2); step(ID, () -> { - // Use scratch directory to store intermediate files. 
Move files to final directory at the end - // The scratch directory is expected to be faster than the final directory - // This also avoids moving files to final directory if the tool fails - Path outDir = getScratchDir(); + Path outDir = getOutDir(); String outputFile = outDir.resolve(toolParams.getOutputFileName()).toString(); Query query = toolParams.toQuery(); QueryOptions queryOptions = new QueryOptions().append(QueryOptions.INCLUDE, toolParams.getInclude()) .append(QueryOptions.EXCLUDE, toolParams.getExclude()); - uris.addAll(variantStorageManager.walkData(outputFile, - format, query, queryOptions, toolParams.getDockerImage(), toolParams.getCommandLine(), token)); - }); - step("move-files", () -> { - // Move files to final directory - if (!uris.isEmpty()) { - IOManager ioManager = catalogManager.getIoManagerFactory().get(uris.get(0)); - for (URI uri : uris) { - String fileName = UriUtils.fileName(uri); - logger.info("Moving file -- " + fileName); - ioManager.move(uri, getOutDir().resolve(fileName).toUri()); - } - } + variantStorageManager.walkData(outputFile, + format, query, queryOptions, toolParams.getDockerImage(), toolParams.getCommandLine(), token); }); } } diff --git a/opencga-app/src/main/java/org/opencb/opencga/app/cli/internal/executors/VariantInternalCommandExecutor.java b/opencga-app/src/main/java/org/opencb/opencga/app/cli/internal/executors/VariantInternalCommandExecutor.java index 7d20deccb09..f59e9178e9a 100644 --- a/opencga-app/src/main/java/org/opencb/opencga/app/cli/internal/executors/VariantInternalCommandExecutor.java +++ b/opencga-app/src/main/java/org/opencb/opencga/app/cli/internal/executors/VariantInternalCommandExecutor.java @@ -341,7 +341,7 @@ private void query(VariantCommandOptions.AbstractVariantQueryCommandOptions cliO queryOptions.putIfNotEmpty("annotations", cliOptions.genericVariantQueryOptions.annotations); VariantExportParams toolParams = new VariantExportParams( - query, outdir, + query, cliOptions.outputFileName, cliOptions.outputFileFormat, cliOptions.variantsFile); diff --git a/opencga-core/src/main/java/org/opencb/opencga/core/models/variant/VariantExportParams.java b/opencga-core/src/main/java/org/opencb/opencga/core/models/variant/VariantExportParams.java index 7e5c1870e8d..3f00adc7be3 100644 --- a/opencga-core/src/main/java/org/opencb/opencga/core/models/variant/VariantExportParams.java +++ b/opencga-core/src/main/java/org/opencb/opencga/core/models/variant/VariantExportParams.java @@ -16,12 +16,13 @@ package org.opencb.opencga.core.models.variant; +import com.fasterxml.jackson.annotation.JsonIgnore; import com.fasterxml.jackson.annotation.JsonInclude; import org.opencb.commons.datastore.core.Query; public class VariantExportParams extends VariantQueryParams { public static final String DESCRIPTION = "Variant export params"; - private String outdir; + private String outputFileName; private String outputFileFormat; private String variantsFile; @@ -35,21 +36,23 @@ public class VariantExportParams extends VariantQueryParams { public VariantExportParams() { } - public VariantExportParams(Query query, String outdir, String outputFileName, String outputFileFormat, + public VariantExportParams(Query query, String outputFileName, String outputFileFormat, String variantsFile) { super(query); - this.outdir = outdir; this.outputFileName = outputFileName; this.outputFileFormat = outputFileFormat; this.variantsFile = variantsFile; } + @Deprecated + @JsonIgnore public String getOutdir() { - return outdir; + return null; } - public VariantExportParams 
setOutdir(String outdir) { - this.outdir = outdir; + @Deprecated + @JsonIgnore + public VariantExportParams setOutdir(String unused) { return this; } From 050c1ee08c74625bb6ef3158fb3dffa5b41e873c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jacobo=20Coll=20Morag=C3=B3n?= Date: Fri, 29 Nov 2024 12:42:21 +0000 Subject: [PATCH 59/66] storage: Improve collections usage in SampleVariantStatsDriver. #TASK-6722 --- .../stats/SampleVariantStatsDriver.java | 51 +++++++++++-------- 1 file changed, 29 insertions(+), 22 deletions(-) diff --git a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/stats/SampleVariantStatsDriver.java b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/stats/SampleVariantStatsDriver.java index 4c68b6f9b64..2a94116882e 100644 --- a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/stats/SampleVariantStatsDriver.java +++ b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/stats/SampleVariantStatsDriver.java @@ -48,7 +48,6 @@ import java.util.*; import java.util.function.Predicate; import java.util.stream.Collectors; -import java.util.stream.IntStream; import static org.opencb.opencga.storage.hadoop.variant.HadoopVariantStorageEngine.STUDY_ID; @@ -341,10 +340,10 @@ public void merge(SampleVariantStatsWritable other) { public static class DistributedSampleVariantStatsCalculator extends SampleVariantStatsCalculator { - private int[] sampleIds; + private List sampleIds; - public DistributedSampleVariantStatsCalculator(Pedigree pedigree, int[] samples) { - super(pedigree, IntStream.of(samples).mapToObj(String::valueOf).collect(Collectors.toList())); + public DistributedSampleVariantStatsCalculator(Pedigree pedigree, List samples) { + super(pedigree, samples.stream().map(String::valueOf).collect(Collectors.toList())); sampleIds = samples; } @@ -363,7 +362,7 @@ public List getWritables() { List writables = new ArrayList<>(statsList.size()); for (int i = 0; i < statsList.size(); i++) { writables.add(new SampleVariantStatsWritable( - sampleIds[i], ti[i], tv[i], qualCount[i], qualSum[i], qualSumSq[i], statsList.get(i))); + sampleIds.get(i), ti[i], tv[i], qualCount[i], qualSum[i], qualSumSq[i], statsList.get(i))); } return writables; } @@ -373,8 +372,8 @@ public List getWritables() { public static class SampleVariantStatsMapper extends VariantRowMapper { private int studyId; - private int[] samples; - private int[] includeSamples; + private LinkedHashSet samples; + private LinkedHashSet includeSamples; protected final Logger logger = LoggerFactory.getLogger(SampleVariantStatsMapper.class); private VariantStorageMetadataManager vsm; @@ -391,8 +390,12 @@ public static class SampleVariantStatsMapper extends VariantRowMapper(samplesArray.length); + Arrays.stream(samplesArray).forEach(samples::add); + int[] includeSamplesArray = context.getConfiguration().getInts(INCLUDE_SAMPLE_IDS); + includeSamples = new LinkedHashSet<>(includeSamplesArray.length); + Arrays.stream(includeSamplesArray).forEach(includeSamples::add); String fileDataQuery = context.getConfiguration().get(VariantQueryParam.FILE_DATA.key()); String sampleDataQuery = context.getConfiguration().get(VariantQueryParam.SAMPLE_DATA.key()); @@ -403,13 +406,13 @@ protected void setup(Context context) throws IOException, InterruptedException { 
sampleDataDpIdx = fixedFormat.indexOf(VCFConstants.DEPTH_KEY); fileDataDpIdx = fileAttributes.indexOf(VCFConstants.DEPTH_KEY); - sampleIdsPosition = new HashMap<>(includeSamples.length); - for (int i = 0; i < includeSamples.length; i++) { - sampleIdsPosition.put(includeSamples[i], i); + sampleIdsPosition = new HashMap<>(includeSamples.size()); + for (Integer sampleId : includeSamples) { + sampleIdsPosition.put(sampleId, sampleIdsPosition.size()); } Pedigree pedigree = readPedigree(context.getConfiguration()); - calculator = new DistributedSampleVariantStatsCalculator(pedigree, samples); + calculator = new DistributedSampleVariantStatsCalculator(pedigree, new ArrayList<>(samples)); calculator.pre(); fileDataFilter = filterFactory.buildFileDataFilter(fileDataQuery); @@ -431,8 +434,7 @@ private List getSamplesFromFileId(int fileId) { private int getSamplePosition(Integer sampleId) { Integer samplePosition = sampleIdsPosition.get(sampleId); if (samplePosition == null) { - throw new IllegalStateException("Sample " + sampleId + " not found in includeSamples " - + Arrays.toString(includeSamples)); + throw new IllegalStateException("Sample " + sampleId + " not found in includeSamples " + includeSamples); } return samplePosition; } @@ -441,19 +443,24 @@ private int getSamplePosition(Integer sampleId) { protected void map(Object key, VariantRow row, Context context) throws IOException, InterruptedException { VariantAnnotation[] annotation = new VariantAnnotation[1]; - List gts = Arrays.asList(new String[samples.length]); - List dps = Arrays.asList(new String[samples.length]); - List quals = Arrays.asList(new String[samples.length]); - List filters = Arrays.asList(new String[samples.length]); + List gts = Arrays.asList(new String[includeSamples.size()]); + List dps = Arrays.asList(new String[includeSamples.size()]); + List quals = Arrays.asList(new String[includeSamples.size()]); + List filters = Arrays.asList(new String[includeSamples.size()]); // All samples valid by default. // If any filter (either sample-data or file-data) fails, then the sample would become invalid. - boolean[] invalidSamples = new boolean[samples.length]; + boolean[] invalidSamples = new boolean[includeSamples.size()]; Variant variant = row.walker().onSample(sampleCell -> { int sampleId = sampleCell.getSampleId(); int samplePosition = getSamplePosition(sampleId); - if (!sampleDataFilter.test(sampleCell)) { + if (sampleCell.getStudyId() != studyId || !includeSamples.contains(sampleId)) { + context.getCounter(COUNTER_GROUP_NAME, "unexpected_sample_discarded").increment(1); + return; + } + if (samples.contains(sampleId) && !sampleDataFilter.test(sampleCell)) { // Invalidate sample + // Do not invalidate extra samples, as might be used for calculating other stats invalidSamples[samplePosition] = true; return; } @@ -519,7 +526,7 @@ protected void map(Object key, VariantRow row, Context context) throws IOExcepti } } context.getCounter(COUNTER_GROUP_NAME, "variants_total").increment(1); - if (invalidSamplesCount == samples.length) { + if (invalidSamplesCount == samples.size()) { context.getCounter(COUNTER_GROUP_NAME, "variants_discarded").increment(1); } else { context.getCounter(COUNTER_GROUP_NAME, "variants_used").increment(1); From a0c2a5f4a1d2fe13d93bfc0dc36f04da13841d46 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jacobo=20Coll=20Morag=C3=B3n?= Date: Mon, 2 Dec 2024 14:43:45 +0000 Subject: [PATCH 60/66] analysis: Fix VariantAnalysisTest. 
#TASK-6722 --- .../manager/VariantStorageManager.java | 4 +-- .../stats/SampleVariantStatsAnalysis.java | 3 +- .../analysis/variant/VariantAnalysisTest.java | 26 +++++++++++--- .../adaptors/phoenix/PhoenixHelper.java | 32 ++--------------- ...ntStatsHBaseMapReduceAnalysisExecutor.java | 11 ++++-- .../filters/VariantRowFilterFactory.java | 3 ++ .../stats/SampleVariantStatsDriver.java | 35 +++++++++++++------ 7 files changed, 62 insertions(+), 52 deletions(-) diff --git a/opencga-analysis/src/main/java/org/opencb/opencga/analysis/variant/manager/VariantStorageManager.java b/opencga-analysis/src/main/java/org/opencb/opencga/analysis/variant/manager/VariantStorageManager.java index a5d02ab0205..9c718fa82a4 100644 --- a/opencga-analysis/src/main/java/org/opencb/opencga/analysis/variant/manager/VariantStorageManager.java +++ b/opencga-analysis/src/main/java/org/opencb/opencga/analysis/variant/manager/VariantStorageManager.java @@ -88,7 +88,6 @@ import org.opencb.opencga.storage.core.variant.VariantStorageOptions; import org.opencb.opencga.storage.core.variant.adaptors.*; import org.opencb.opencga.storage.core.variant.adaptors.iterators.VariantDBIterator; -import org.opencb.opencga.storage.core.variant.io.VariantWriterFactory; import org.opencb.opencga.storage.core.variant.io.VariantWriterFactory.VariantOutputFormat; import org.opencb.opencga.storage.core.variant.query.ParsedQuery; import org.opencb.opencga.storage.core.variant.query.VariantQueryResult; @@ -469,8 +468,9 @@ private CatalogStorageMetadataSynchronizer getSynchronizer(VariantStorageEngine return synchronizer; } - public DataResult familyIndexBySamples(String study, Collection samples, ObjectMap params, String token) + public DataResult familyIndexBySamples(String inputStudy, Collection samples, ObjectMap params, String token) throws CatalogException, StorageEngineException { + String study = getStudyFqn(inputStudy, token); return secureOperation(VariantFamilyIndexOperationTool.ID, study, params, token, engine -> { Collection thisSamples = samples; boolean allSamples; diff --git a/opencga-analysis/src/main/java/org/opencb/opencga/analysis/variant/stats/SampleVariantStatsAnalysis.java b/opencga-analysis/src/main/java/org/opencb/opencga/analysis/variant/stats/SampleVariantStatsAnalysis.java index a12072e3a13..540051b4a95 100644 --- a/opencga-analysis/src/main/java/org/opencb/opencga/analysis/variant/stats/SampleVariantStatsAnalysis.java +++ b/opencga-analysis/src/main/java/org/opencb/opencga/analysis/variant/stats/SampleVariantStatsAnalysis.java @@ -43,6 +43,7 @@ import org.opencb.opencga.core.tools.annotations.Tool; import org.opencb.opencga.core.tools.annotations.ToolParams; import org.opencb.opencga.core.tools.variant.SampleVariantStatsAnalysisExecutor; +import org.opencb.opencga.storage.core.variant.adaptors.VariantQuery; import org.opencb.opencga.storage.core.variant.adaptors.VariantQueryParam; import java.io.OutputStream; @@ -254,7 +255,7 @@ protected void run() throws ToolException { .setOutputFile(tmpOutputFile) .setStudy(study) .setSampleNames(batchSamples) - .setVariantQuery(variantQuery) + .setVariantQuery(new VariantQuery(variantQuery).includeSample(batchSamples)) .execute(); if (tmpOutputFile != outputFile) { diff --git a/opencga-analysis/src/test/java/org/opencb/opencga/analysis/variant/VariantAnalysisTest.java b/opencga-analysis/src/test/java/org/opencb/opencga/analysis/variant/VariantAnalysisTest.java index 8f3f9695a3c..0a622c39823 100644 --- 
a/opencga-analysis/src/test/java/org/opencb/opencga/analysis/variant/VariantAnalysisTest.java +++ b/opencga-analysis/src/test/java/org/opencb/opencga/analysis/variant/VariantAnalysisTest.java @@ -80,10 +80,7 @@ import org.opencb.opencga.core.models.organizations.OrganizationUpdateParams; import org.opencb.opencga.core.models.project.ProjectCreateParams; import org.opencb.opencga.core.models.project.ProjectOrganism; -import org.opencb.opencga.core.models.sample.Sample; -import org.opencb.opencga.core.models.sample.SampleQualityControl; -import org.opencb.opencga.core.models.sample.SampleReferenceParam; -import org.opencb.opencga.core.models.sample.SampleUpdateParams; +import org.opencb.opencga.core.models.sample.*; import org.opencb.opencga.core.models.variant.*; import org.opencb.opencga.core.response.OpenCGAResult; import org.opencb.opencga.core.testclassification.duration.LongTests; @@ -184,7 +181,6 @@ public void setUp() throws Throwable { VariantOperationsTest.dummyVariantSetup(variantStorageManager, CANCER_STUDY, token); file = opencga.createFile(STUDY, "variant-test-file.vcf.gz", token); - variantStorageManager.index(STUDY, file.getId(), opencga.createTmpOutdir("_index"), new ObjectMap(VariantStorageOptions.ANNOTATE.key(), true), token); for (int i = 0; i < file.getSampleIds().size(); i++) { String id = file.getSampleIds().get(i); @@ -233,6 +229,9 @@ public void setUp() throws Throwable { individuals.stream().map(Individual::getId).collect(Collectors.toList()), new QueryOptions(), token); + variantStorageManager.index(STUDY, file.getId(), opencga.createTmpOutdir("_index"), new ObjectMap(VariantStorageOptions.ANNOTATE.key(), true), token); + variantStorageManager.familyIndexBySamples(STUDY, file.getSampleIds(), new ObjectMap(), token); + // Cancer (SV) ObjectMap config = new ObjectMap(); // config.put(VariantStorageOptions.ANNOTATE.key(), true); @@ -411,6 +410,7 @@ private java.io.File getOutputFile(Path outDir) { @Test public void testSampleStatsSampleFilter() throws Exception { + clearSampleVariantStats(); Assume.assumeThat(storageEngine, CoreMatchers.is(HadoopVariantStorageEngine.STORAGE_ENGINE_ID)); // Reset quality control stats for (Sample sample : catalogManager.getSampleManager().search(STUDY, new Query(), new QueryOptions(), token).getResults()) { @@ -431,11 +431,27 @@ public void testSampleStatsSampleFilter() throws Exception { @Test public void testSampleStatsWithGeneFilter() throws Exception { + clearSampleVariantStats(); sampleVariantStats(null, "stats_BRCA1", false, 1, file.getSampleIds().subList(0, 2), false, new VariantQuery().gene("BRCA1")); } + @Test + public void testSampleStatsFromOffspringFilter() throws Exception { + clearSampleVariantStats(); + sampleVariantStats(null, "stats_offspring", false, 1, Collections.singletonList(daughter)); + } + + private void clearSampleVariantStats() throws CatalogException { + for (String sampleId : file.getSampleIds()) { + SampleQualityControl qualityControl = catalogManager.getSampleManager().get(STUDY, sampleId, new QueryOptions(), token).first().getQualityControl(); + qualityControl.getVariant().getVariantStats().clear(); + catalogManager.getSampleManager().update(STUDY, sampleId, new SampleUpdateParams().setQualityControl(qualityControl), new QueryOptions(), token); + } + } + @Test public void testSampleStats() throws Exception { + clearSampleVariantStats(); sampleVariantStats("1,2", "stats_1", false, 1, file.getSampleIds().subList(0, 2)); sampleVariantStats("1,2", "stats_1", false, 1, file.getSampleIds().subList(2, 4)); 
sampleVariantStats("1,2", "stats_2", false, 2, Collections.singletonList(ParamConstants.ALL)); diff --git a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/adaptors/phoenix/PhoenixHelper.java b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/adaptors/phoenix/PhoenixHelper.java index a7a87cad4e7..b702ba1a89e 100644 --- a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/adaptors/phoenix/PhoenixHelper.java +++ b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/adaptors/phoenix/PhoenixHelper.java @@ -36,10 +36,7 @@ import org.apache.phoenix.schema.PTable; import org.apache.phoenix.schema.PTableType; import org.apache.phoenix.schema.TableNotFoundException; -import org.apache.phoenix.schema.types.PArrayDataType; -import org.apache.phoenix.schema.types.PDataType; -import org.apache.phoenix.schema.types.PInteger; -import org.apache.phoenix.schema.types.PhoenixArray; +import org.apache.phoenix.schema.types.*; import org.apache.phoenix.util.*; import org.opencb.opencga.core.common.BatchUtils; import org.opencb.opencga.core.common.ExceptionUtils; @@ -51,8 +48,6 @@ import org.slf4j.LoggerFactory; import java.io.IOException; -import java.lang.reflect.InvocationTargetException; -import java.lang.reflect.Method; import java.sql.Connection; import java.sql.ResultSet; import java.sql.SQLException; @@ -75,37 +70,14 @@ public class PhoenixHelper { private final Configuration conf; private static Logger logger = LoggerFactory.getLogger(PhoenixHelper.class); - private static Method positionAtArrayElement; private TableName systemCatalog; public PhoenixHelper(Configuration conf) { this.conf = conf; } - static { - Class decoder; - try { - decoder = Class.forName("org.apache.phoenix.schema.types.PArrayDataTypeDecoder"); - } catch (ClassNotFoundException e) { - decoder = PArrayDataType.class; - } - try { - positionAtArrayElement = decoder.getMethod("positionAtArrayElement", - ImmutableBytesWritable.class, Integer.TYPE, PDataType.class, Integer.class); - } catch (NoSuchMethodException e) { - // This should never happen! 
-            throw new RuntimeException(e);
-        }
-    }
-
     public static boolean positionAtArrayElement(ImmutableBytesWritable ptr, int arrayIndex, PDataType pDataType, Integer byteSize) {
-//        return PArrayDataTypeDecoder.positionAtArrayElement(ptr, arrayIndex, instance, byteSize);
-        try {
-            Object o = positionAtArrayElement.invoke(null, ptr, arrayIndex, pDataType, byteSize);
-            return o == null || (boolean) o;
-        } catch (IllegalAccessException | InvocationTargetException e) {
-            throw new RuntimeException(e);
-        }
+        return PArrayDataTypeDecoder.positionAtArrayElement(ptr, arrayIndex, pDataType, byteSize);
     }
 
     public boolean execute(Connection con, String sql) throws SQLException {
diff --git a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/analysis/stats/SampleVariantStatsHBaseMapReduceAnalysisExecutor.java b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/analysis/stats/SampleVariantStatsHBaseMapReduceAnalysisExecutor.java
index 8b4b0175356..bba1d238731 100644
--- a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/analysis/stats/SampleVariantStatsHBaseMapReduceAnalysisExecutor.java
+++ b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/analysis/stats/SampleVariantStatsHBaseMapReduceAnalysisExecutor.java
@@ -6,8 +6,9 @@
 import org.opencb.opencga.core.exceptions.ToolExecutorException;
 import org.opencb.opencga.core.tools.annotations.ToolExecutor;
 import org.opencb.opencga.core.tools.variant.SampleVariantStatsAnalysisExecutor;
+import org.opencb.opencga.storage.core.variant.adaptors.VariantQuery;
 import org.opencb.opencga.storage.core.variant.adaptors.VariantQueryException;
-import org.opencb.opencga.storage.core.variant.query.ParsedVariantQuery;
+import org.opencb.opencga.storage.core.variant.adaptors.VariantQueryParam;
 import org.opencb.opencga.storage.hadoop.variant.HadoopVariantStorageEngine;
 import org.opencb.opencga.storage.hadoop.variant.adaptors.VariantHadoopDBAdaptor;
 import org.opencb.opencga.storage.hadoop.variant.analysis.HadoopVariantStorageToolExecutor;
@@ -45,9 +46,13 @@ public void run() throws ToolException {
             }
         }
 
-        ParsedVariantQuery variantQuery = engine.parseQuery(getVariantQuery(), new QueryOptions());
+        VariantQuery query = engine.parseQuery(getVariantQuery(), new QueryOptions()).getQuery();
+        // SampleData and FileData filters should not include the sample or file names.
+        // The parser would add them otherwise. Restore the original query values (if any).
+        query.putIfNotNull(VariantQueryParam.SAMPLE_DATA.key(), getVariantQuery().get(VariantQueryParam.SAMPLE_DATA.key()));
+        query.putIfNotNull(VariantQueryParam.FILE_DATA.key(), getVariantQuery().get(VariantQueryParam.FILE_DATA.key()));
         ObjectMap params = new ObjectMap(engine.getOptions())
-                .appendAll(variantQuery.getQuery())
+                .appendAll(query)
                 .append(SampleVariantStatsDriver.SAMPLES, sampleNames)
                 .append(SampleVariantStatsDriver.OUTPUT, getOutputFile().toAbsolutePath().toUri());
         engine.getMRExecutor().run(SampleVariantStatsDriver.class, SampleVariantStatsDriver.buildArgs(
diff --git a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/filters/VariantRowFilterFactory.java b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/filters/VariantRowFilterFactory.java
index d7d3a09a07f..d3007764651 100644
--- a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/filters/VariantRowFilterFactory.java
+++ b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/filters/VariantRowFilterFactory.java
@@ -145,6 +145,9 @@ public Predicate buildSampleDataFilter(String sampleDat
         final Predicate predicate;
 
         int idx = fixedFormat.indexOf(filter.getKey());
+        if (idx < 0) {
+            throw new IllegalArgumentException("Unknown key '" + filter.getKey() + "'. Supported keys are: " + fixedFormat);
+        }
         String filterValue = filter.getValue();
         if (StringUtils.isNumeric(filterValue)) {
             // Numeric value
diff --git a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/stats/SampleVariantStatsDriver.java b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/stats/SampleVariantStatsDriver.java
index 2a94116882e..abc43465322 100644
--- a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/stats/SampleVariantStatsDriver.java
+++ b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/stats/SampleVariantStatsDriver.java
@@ -95,16 +95,17 @@ protected void parseAndValidateParameters() throws IOException {
         List<String> samples = Arrays.asList(samplesStr.split(","));
         StringBuilder trios = new StringBuilder();
+        int triosCount = 0;
         includeSample = new LinkedHashSet<>();
         if (samples.size() == 1 && (samples.get(0).equals("auto") || samples.get(0).equals("all"))) {
             boolean all = samples.get(0).equals("all");
-            metadataManager.sampleMetadataIterator(studyId).forEachRemaining(sampleMetadata -> {
+            for (SampleMetadata sampleMetadata : metadataManager.sampleMetadataIterable(studyId)) {
                 if (sampleMetadata.isIndexed()) {
                     if (all || sampleMetadata.getStats() == null || MapUtils.isEmpty(sampleMetadata.getStats().getBiotypeCount())) {
-                        addTrio(trios, includeSample, sampleMetadata);
+                        triosCount += addTrio(trios, includeSample, sampleMetadata);
                     }
                 }
-            });
+            }
             sampleIds = new ArrayList<>(includeSample);
         } else {
             sampleIds = new ArrayList<>(samples.size());
@@ -114,12 +115,15 @@
                     throw VariantQueryException.sampleNotFound(sample, metadataManager.getStudyName(studyId));
                 }
                 sampleIds.add(sampleId);
-                addTrio(trios, includeSample, metadataManager.getSampleMetadata(studyId, sampleId));
+                triosCount += addTrio(trios, includeSample, metadataManager.getSampleMetadata(studyId, sampleId));
             }
         }
         if (sampleIds.isEmpty()) {
             throw new IllegalArgumentException("Nothing to do!");
         }
+        LOGGER.info(" * samples : " + (samples.size() > 10 ? (samples.subList(0, 10) + "...") : samples) + " (" + samples.size() + ")");
+        LOGGER.info(" * includeSamples : " + includeSample.size());
+        LOGGER.info(" * familyTrios : " + triosCount);
         fileData = getParam(VariantQueryParam.FILE_DATA.key());
         if (StringUtils.isNotEmpty(fileData)) {
             LOGGER.info(" * fileData : " + fileData);
@@ -133,7 +137,7 @@
     }
 
-    private void addTrio(StringBuilder trios, Set<Integer> includeSample, SampleMetadata sampleMetadata) {
+    private int addTrio(StringBuilder trios, Set<Integer> includeSample, SampleMetadata sampleMetadata) {
         includeSample.add(sampleMetadata.getId());
         if (sampleMetadata.getFather() != null || sampleMetadata.getMother() != null) {
             // Make sure parents are included in the query
@@ -149,7 +153,9 @@
                     .append(",")
                     .append(sampleMetadata.getMother() == null ? "0" : sampleMetadata.getMother())
                     .append(";");
+            return 1;
         }
+        return 0;
     }
 
     private static Pedigree readPedigree(Configuration conf) {
@@ -340,11 +346,13 @@ public void merge(SampleVariantStatsWritable other) {
 
     public static class DistributedSampleVariantStatsCalculator extends SampleVariantStatsCalculator {
 
-        private List<Integer> sampleIds;
+        private Set<Integer> sampleIds;
+        private List<Integer> includeSampleIds;
 
-        public DistributedSampleVariantStatsCalculator(Pedigree pedigree, List<Integer> samples) {
-            super(pedigree, samples.stream().map(String::valueOf).collect(Collectors.toList()));
+        public DistributedSampleVariantStatsCalculator(Pedigree pedigree, Set<Integer> samples, List<Integer> includeSamples) {
+            super(pedigree, includeSamples.stream().map(String::valueOf).collect(Collectors.toList()));
             sampleIds = samples;
+            includeSampleIds = includeSamples;
         }
 
         public DistributedSampleVariantStatsCalculator(SampleVariantStatsWritable statsWritable) {
@@ -361,8 +369,13 @@ public DistributedSampleVariantStatsCalculator(SampleVariantStatsW
         public List<SampleVariantStatsWritable> getWritables() {
             List<SampleVariantStatsWritable> writables = new ArrayList<>(statsList.size());
             for (int i = 0; i < statsList.size(); i++) {
-                writables.add(new SampleVariantStatsWritable(
-                        sampleIds.get(i), ti[i], tv[i], qualCount[i], qualSum[i], qualSumSq[i], statsList.get(i)));
+                Integer sampleId = includeSampleIds.get(i);
+                if (sampleIds.contains(sampleId)) {
+                    // Only write samples that were requested
+                    // Skip samples that were included but not requested, as they are used for calculating other stats
+                    writables.add(new SampleVariantStatsWritable(
+                            sampleId, ti[i], tv[i], qualCount[i], qualSum[i], qualSumSq[i], statsList.get(i)));
+                }
             }
             return writables;
         }
@@ -412,7 +425,7 @@ protected void setup(Context context) throws IOException, InterruptedException
             }
 
             Pedigree pedigree = readPedigree(context.getConfiguration());
-            calculator = new DistributedSampleVariantStatsCalculator(pedigree, new ArrayList<>(samples));
+            calculator = new DistributedSampleVariantStatsCalculator(pedigree, samples, new ArrayList<>(includeSamples));
             calculator.pre();
 
             fileDataFilter = filterFactory.buildFileDataFilter(fileDataQuery);
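Note on the SampleVariantStatsDriver change above: trio stats are still computed over the full set of included samples (each requested child plus its parents), but only the explicitly requested samples are written out. A minimal standalone sketch of that include-then-filter pattern, with illustrative names rather than the real OpenCGA classes:

    import java.util.*;

    public class RequestedSamplesSketch {
        public static void main(String[] args) {
            // Requested: the child only. Included: child plus both parents, needed for trio stats.
            Set<Integer> requested = new HashSet<>(Collections.singletonList(3));
            List<Integer> included = Arrays.asList(1, 2, 3);

            Map<Integer, String> computed = new LinkedHashMap<>();
            for (Integer sampleId : included) {
                computed.put(sampleId, "stats-of-" + sampleId); // parents contribute to the child's stats
            }
            computed.keySet().retainAll(requested); // but only requested samples are emitted
            System.out.println(computed); // {3=stats-of-3}
        }
    }
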
From 3853c638c96dfc5e5ca5628cd1af595cfaae558e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jacobo=20Coll=20Morag=C3=B3n?=
Date: Tue, 3 Dec 2024 09:14:19 +0000
Subject: [PATCH 61/66] app: Regenerate cli. #TASK-6722

---
 .../AnalysisVariantCommandExecutor.java       |  1 -
 .../AnalysisVariantCommandOptions.java        |  3 -
 .../core/metadata/models/ProjectMetadata.java | 61 +++++++++++++++++++
 .../hadoop/app/VariantMetadataMain.java       | 58 +++++++++++++++++-
 4 files changed, 118 insertions(+), 5 deletions(-)

diff --git a/opencga-app/src/main/java/org/opencb/opencga/app/cli/main/executors/AnalysisVariantCommandExecutor.java b/opencga-app/src/main/java/org/opencb/opencga/app/cli/main/executors/AnalysisVariantCommandExecutor.java
index 1410934a499..95d1bcf7b4e 100644
--- a/opencga-app/src/main/java/org/opencb/opencga/app/cli/main/executors/AnalysisVariantCommandExecutor.java
+++ b/opencga-app/src/main/java/org/opencb/opencga/app/cli/main/executors/AnalysisVariantCommandExecutor.java
@@ -559,7 +559,6 @@ private RestResponse runExport() throws Exception {
             putNestedIfNotEmpty(beanParams, "unknownGenotype", commandOptions.unknownGenotype, true);
             putNestedIfNotNull(beanParams, "sampleMetadata", commandOptions.sampleMetadata, true);
             putNestedIfNotNull(beanParams, "sort", commandOptions.sort, true);
-            putNestedIfNotEmpty(beanParams, "outdir", commandOptions.outdir, true);
             putNestedIfNotEmpty(beanParams, "outputFileName", commandOptions.outputFileName, true);
             putNestedIfNotEmpty(beanParams, "outputFileFormat", commandOptions.outputFileFormat, true);
             putNestedIfNotEmpty(beanParams, "variantsFile", commandOptions.variantsFile, true);
diff --git a/opencga-app/src/main/java/org/opencb/opencga/app/cli/main/options/AnalysisVariantCommandOptions.java b/opencga-app/src/main/java/org/opencb/opencga/app/cli/main/options/AnalysisVariantCommandOptions.java
index 24a37e04229..491495faa8f 100644
--- a/opencga-app/src/main/java/org/opencb/opencga/app/cli/main/options/AnalysisVariantCommandOptions.java
+++ b/opencga-app/src/main/java/org/opencb/opencga/app/cli/main/options/AnalysisVariantCommandOptions.java
@@ -723,9 +723,6 @@ public class RunExportCommandOptions {
         @Parameter(names = {"--sort"}, description = "The body web service sort parameter", required = false, help = true, arity = 0)
         public boolean sort = false;
 
-        @Parameter(names = {"--outdir"}, description = "The body web service outdir parameter", required = false, arity = 1)
-        public String outdir;
-
         @Parameter(names = {"--output-file-name"}, description = "The body web service outputFileName parameter", required = false, arity = 1)
         public String outputFileName;
diff --git a/opencga-storage/opencga-storage-core/src/main/java/org/opencb/opencga/storage/core/metadata/models/ProjectMetadata.java b/opencga-storage/opencga-storage-core/src/main/java/org/opencb/opencga/storage/core/metadata/models/ProjectMetadata.java
index e64deedff64..fc8744528aa 100644
--- a/opencga-storage/opencga-storage-core/src/main/java/org/opencb/opencga/storage/core/metadata/models/ProjectMetadata.java
+++ b/opencga-storage/opencga-storage-core/src/main/java/org/opencb/opencga/storage/core/metadata/models/ProjectMetadata.java
@@ -70,6 +70,24 @@ public VariantAnnotationSets setSaved(List saved) {
             this.saved = saved;
             return this;
         }
+
+        @Override
+        public boolean equals(Object o) {
+            if (this == o) {
+                return true;
+            }
+            if (o == null || getClass() != o.getClass()) {
+                return false;
+            }
+            VariantAnnotationSets that = (VariantAnnotationSets) o;
+            return Objects.equals(current, that.current)
+                    && Objects.equals(saved, that.saved);
+        }
+
+        @Override
+        public int hashCode() {
+            return Objects.hash(current, saved);
+        }
     }
 
     public static class VariantAnnotationMetadata {
@@ -158,6 +176,28 @@ public VariantAnnotationMetadata setPrivateSources(List privateSources)
             this.privateSources = privateSources;
             return this;
         }
+
+        @Override
+        public boolean equals(Object o) {
+            if (this == o) {
+                return true;
+            }
+            if (o == null || getClass() != o.getClass()) {
+                return false;
+            }
+            VariantAnnotationMetadata that = (VariantAnnotationMetadata) o;
+            return id == that.id && Objects.equals(name, that.name)
+                    && Objects.equals(creationDate, that.creationDate)
+                    && Objects.equals(annotator, that.annotator)
+                    && Objects.equals(sourceVersion, that.sourceVersion)
+                    && Objects.equals(dataRelease, that.dataRelease)
+                    && Objects.equals(privateSources, that.privateSources);
+        }
+
+        @Override
+        public int hashCode() {
+            return Objects.hash(id, name, creationDate, annotator, sourceVersion, dataRelease, privateSources);
+        }
     }
 
     public static class VariantAnnotatorProgram {
@@ -321,4 +361,25 @@ public ProjectMetadata setAttributes(ObjectMap attributes) {
         return this;
     }
 
+    @Override
+    public boolean equals(Object o) {
+        if (this == o) {
+            return true;
+        }
+        if (o == null || getClass() != o.getClass()) {
+            return false;
+        }
+        ProjectMetadata that = (ProjectMetadata) o;
+        return release == that.release && Objects.equals(species, that.species)
+                && Objects.equals(assembly, that.assembly)
+                && Objects.equals(dataRelease, that.dataRelease)
+                && Objects.equals(annotation, that.annotation)
+                && Objects.equals(counters, that.counters)
+                && Objects.equals(attributes, that.attributes);
+    }
+
+    @Override
+    public int hashCode() {
+        return Objects.hash(species, assembly, dataRelease, release, annotation, counters, attributes);
+    }
 }
diff --git a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/app/VariantMetadataMain.java b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/app/VariantMetadataMain.java
index 202ab39f0b4..34fa7890263 100644
--- a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/app/VariantMetadataMain.java
+++ b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/app/VariantMetadataMain.java
@@ -56,7 +56,10 @@ public VariantMetadataCommandExecutor() {
         public VariantMetadataCommandExecutor(String argsContext) {
             super(argsContext);
             addSubCommand(Arrays.asList("tables", "table"), "[help|list]", new HBaseTablesCommandExecutor());
-            addSubCommand(Arrays.asList("study-metadata", "sm", "study", "studies"), "[help|list|id|read|write|rename] ..",
+            addSubCommand(Arrays.asList("project-metadata", "p", "pm", "project"), "[help|read|write|replace] ..",
+                    new ProjectCommandExecutor());
+            addSubCommand(Arrays.asList("study-metadata", "sm", "study", "studies"), "[help|list|id|read|write|replace|rename] "
+                    + " ..",
                     new StudyCommandExecutor());
             addSubCommand(Arrays.asList("file-metadata", "fm", "file", "files"), "[help|list|id|read|write] ...",
                     new FileCommandExecutor());
@@ -115,6 +118,37 @@ protected void cleanup(String command, String[] args) throws Exception {
         }
     }
 
+    private static class ProjectCommandExecutor extends VariantStorageMetadataManagerCommandExecutor {
+        ProjectCommandExecutor() {
+            addSubCommand(Arrays.asList("read", "info"),
+                    "",
+                    args -> {
+                        print(mm.getProjectMetadata());
+                    }
+            );
+            addSubCommand(Arrays.asList("write", "update"),
+                    " ",
+                    args -> {
+                        ProjectMetadata projectMetadata = readFile(getArg(args, 1), ProjectMetadata.class);
+                        mm.updateProjectMetadata(pm -> projectMetadata);
+                    }
+            );
+            addSubCommand(Arrays.asList("replace"),
+                    " ",
+                    args -> {
+                        ProjectMetadata origProjectMetadata = readFile(getArg(args, 1), ProjectMetadata.class);
+                        ProjectMetadata newProjectMetadata = readFile(getArg(args, 2), ProjectMetadata.class);
+                        mm.updateProjectMetadata(pm -> {
+                            if (!pm.equals(origProjectMetadata)) {
+                                throw new IllegalStateException("Original ProjectMetadata does not match!");
+                            }
+                            return newProjectMetadata;
+                        });
+                    }
+            );
+        }
+    }
+
     private static class StudyCommandExecutor extends VariantStorageMetadataManagerCommandExecutor {
         StudyCommandExecutor() {
             addSubCommand(Arrays.asList("list", "search"),
@@ -129,12 +163,34 @@ private static class StudyCommandExecutor extends VariantStorageMetadataManagerC
                         print(mm.getStudyMetadata(getArg(args, 1)));
                     }
             );
+            addSubCommand(Arrays.asList("id"),
+                    " ",
+                    args -> {
+                        print(mm.getStudyId(getArg(args, 1)));
+                    }
+            );
             addSubCommand(Arrays.asList("write", "update"),
                     " ",
                    args -> {
                         mm.unsecureUpdateStudyMetadata(readFile(getArg(args, 1), StudyMetadata.class));
                     }
             );
+            addSubCommand(Arrays.asList("replace"),
+                    " ",
+                    args -> {
+                        StudyMetadata origStudyMetadata = readFile(getArg(args, 1), StudyMetadata.class);
+                        StudyMetadata newStudyMetadata = readFile(getArg(args, 2), StudyMetadata.class);
+                        if (origStudyMetadata.getId() != newStudyMetadata.getId()) {
+                            throw new IllegalStateException("StudyMetadata IDs do not match!");
+                        }
+                        mm.updateStudyMetadata(origStudyMetadata.getId(), sm -> {
+                            if (!sm.equals(origStudyMetadata)) {
+                                throw new IllegalStateException("Original StudyMetadata does not match!");
+                            }
+                            return newStudyMetadata;
+                        });
+                    });
             addSubCommand(Arrays.asList("rename"),
                     " ",
                     args -> {
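The new "replace" subcommands above follow an optimistic compare-and-swap discipline: the caller supplies both the metadata it originally read and the metadata it wants to write, and the update is rejected if the stored copy changed in between — which is exactly what the new ProjectMetadata equals()/hashCode() methods make possible. A minimal sketch of the same pattern with hypothetical names (the real code goes through VariantStorageMetadataManager):

    import java.util.concurrent.atomic.AtomicReference;

    public class CompareAndSwapSketch {
        static <T> void replace(AtomicReference<T> store, T expected, T replacement) {
            store.updateAndGet(current -> {
                if (!current.equals(expected)) {
                    throw new IllegalStateException("Original metadata does not match!");
                }
                return replacement;
            });
        }

        public static void main(String[] args) {
            AtomicReference<String> store = new AtomicReference<>("project-v1");
            replace(store, "project-v1", "project-v2"); // succeeds: stored copy still matches
            System.out.println(store.get());            // project-v2
            // replace(store, "project-v1", "project-v3") would now throw IllegalStateException
        }
    }
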
From eb61609e5e2829123e70104f49428186da7c2b88 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jacobo=20Coll=20Morag=C3=B3n?=
Date: Tue, 3 Dec 2024 22:21:40 +0000
Subject: [PATCH 62/66] storage: Fix junit tests. #TASK-6722

---
 .../variant/mr/StreamVariantMapper.java       | 11 ++++++---
 .../analysis/gwas/FisherTestDriverTest.java   | 24 +++++++-------------
 .../variant/stats/SampleVariantStatsTest.java | 14 ++++-------
 .../walker/HadoopVariantWalkerTest.java       |  9 ++++++-
 .../test/resources/variantWalker/Dockerfile   |  2 +-
 5 files changed, 30 insertions(+), 30 deletions(-)

diff --git a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/mr/StreamVariantMapper.java b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/mr/StreamVariantMapper.java
index 89758279e05..f4b389bd692 100644
--- a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/mr/StreamVariantMapper.java
+++ b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/mr/StreamVariantMapper.java
@@ -44,6 +44,7 @@ public class StreamVariantMapper extends VariantMapper {
     public static final String COMMANDLINE_BASE64 = "opencga.variant.stream.commandline_base64";
     public static final String ADDENVIRONMENT_PARAM = "opencga.variant.stream.addenvironment";
     public static final String HAS_REDUCE = "opencga.variant.stream.hasReduce";
+    public static final String DOCKER_PRUNE_OPTS = "opencga.variant.stream.docker.prune.opts";
     private final boolean verboseStdout = false;
 
     private static final long REPORTER_OUT_DELAY = 10 * 1000L;
@@ -269,16 +270,20 @@ private void throwExceptionIfAny() throws IOException {
     @Override
     protected void cleanup(Mapper.Context context) throws IOException, InterruptedException {
         closeProcess(context, true);
-        dockerPruneImages();
+        dockerPruneImages(context.getConfiguration());
         super.cleanup(context);
     }
 
-    private void dockerPruneImages() {
+    private void dockerPruneImages(Configuration conf) {
        try {
             LOG.info("Pruning docker images");
             int maxImages = 5;
+
+
+            String dockerPruneOpts = conf.get(DOCKER_PRUNE_OPTS, "");
             Command command = new Command(new String[]{"bash", "-c",
                     "[ $(docker image ls --format json | wc -l) -gt " + maxImages + " ] "
-                            + "&& echo 'Run docker image prune' && docker image prune -f --all --filter label!=storage='do_not_delete'"
+                            + "&& echo 'Run docker image prune' && docker image prune -f --all " + dockerPruneOpts
                             + "|| echo 'Skipping docker image prune. Less than " + maxImages + " images.'"}, Collections.emptyMap());
             command.run();
             int ecode = command.getExitValue();
diff --git a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/test/java/org/opencb/opencga/storage/hadoop/variant/analysis/gwas/FisherTestDriverTest.java b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/test/java/org/opencb/opencga/storage/hadoop/variant/analysis/gwas/FisherTestDriverTest.java
index 0052e805798..2a352502b9a 100644
--- a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/test/java/org/opencb/opencga/storage/hadoop/variant/analysis/gwas/FisherTestDriverTest.java
+++ b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/test/java/org/opencb/opencga/storage/hadoop/variant/analysis/gwas/FisherTestDriverTest.java
@@ -20,15 +20,16 @@
 import org.opencb.opencga.storage.hadoop.variant.VariantHbaseTestUtils;
 import org.opencb.opencga.storage.hadoop.variant.adaptors.VariantHadoopDBAdaptor;
 
-import java.io.*;
+import java.io.BufferedReader;
+import java.io.DataInputStream;
+import java.io.FileInputStream;
+import java.io.InputStreamReader;
 import java.net.URI;
 import java.util.Collections;
 import java.util.HashSet;
 import java.util.List;
 import java.util.Set;
 import java.util.stream.Collectors;
-import java.util.stream.IntStream;
-import java.util.stream.Stream;
 
 @Category(LongTests.class)
 public class FisherTestDriverTest extends VariantStorageBaseTest implements HadoopVariantStorageTest {
@@ -78,18 +79,17 @@ public void testFisher() throws Exception {
                 .map(s -> metadataManager.getSampleName(studyMetadata.getId(), s))
                 .collect(Collectors.toList());
 
+        URI local1 = localOut.resolve("fisher_result.tsv");
         ObjectMap objectMap = new ObjectMap()
                 .append(FisherTestDriver.CONTROL_COHORT, controlCohort)
                 .append(FisherTestDriver.CASE_COHORT, caseCohort)
-                .append(FisherTestDriver.OUTPUT, "fisher_result");
+                .append(FisherTestDriver.OUTPUT, local1);
 
         getMrExecutor().run(FisherTestDriver.class, FisherTestDriver.buildArgs(
                 dbAdaptor.getArchiveTableName(1),
                 dbAdaptor.getVariantTable(),
                 1,
                 Collections.emptySet(), objectMap), "");
 
-        URI local1 = copyToLocal("fisher_result");
-
         URI local2 = localOut.resolve("fisher_result2.tsv");
         objectMap.append(FisherTestDriver.OUTPUT, local2)
                 .append(VariantQueryParam.ANNOT_CONSEQUENCE_TYPE.key(), "lof,missense_variant")
@@ -100,15 +100,14 @@
                 1,
                 Collections.emptySet(), objectMap), "");
 
-//        URI local2 = copyToLocal("fisher_result2");
         variantStorageEngine.loadVariantScore(local1, studyMetadata.getName(), "fisher1", "ALL", null, new VariantScoreFormatDescriptor(1, 16, 15), new ObjectMap());
         variantStorageEngine.loadVariantScore(local2, studyMetadata.getName(), "fisher2", "ALL", null, new VariantScoreFormatDescriptor(1, 16, 15), new ObjectMap());
 
-        FileSystem fs = FileSystem.get(configuration.get());
+        FileSystem fs = FileSystem.get(local1, configuration.get());
         Set<String> lines1 = new HashSet<>();
         int lines2 = 0;
-        try (BufferedReader is = new BufferedReader(new InputStreamReader(fs.open(new Path("fisher_result/part-r-00000"))))) {
+        try (BufferedReader is = new BufferedReader(new InputStreamReader(fs.open(new Path(local1))))) {
             String x = is.readLine();
             while (StringUtils.isNotEmpty(x)) {
//                System.out.println(x);
@@ -134,11 +133,4 @@
         Assert.assertThat(lines2, VariantMatchers.gt(0));
     }
 
-    private URI copyToLocal(String s) throws IOException {
-        FileSystem fs = FileSystem.get(configuration.get());
-        URI local = localOut.resolve(s + ".tsv");
-        fs.copyToLocalFile(new Path(s + "/part-r-00000"), new Path(local));
-        return local;
-    }
-
 }
\ No newline at end of file
diff --git a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/test/java/org/opencb/opencga/storage/hadoop/variant/stats/SampleVariantStatsTest.java b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/test/java/org/opencb/opencga/storage/hadoop/variant/stats/SampleVariantStatsTest.java
index 5a3a0a1ea54..e5f9417c308 100644
--- a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/test/java/org/opencb/opencga/storage/hadoop/variant/stats/SampleVariantStatsTest.java
+++ b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/test/java/org/opencb/opencga/storage/hadoop/variant/stats/SampleVariantStatsTest.java
@@ -1,6 +1,5 @@
 package org.opencb.opencga.storage.hadoop.variant.stats;
 
-import org.apache.commons.io.FileUtils;
 import org.junit.*;
 import org.junit.experimental.categories.Category;
 import org.junit.rules.ExternalResource;
@@ -23,8 +22,8 @@
 import org.opencb.opencga.storage.hadoop.variant.HadoopVariantStorageTest;
 import org.opencb.opencga.storage.hadoop.variant.VariantHbaseTestUtils;
 
-import java.io.File;
 import java.net.URI;
+import java.nio.file.Paths;
 import java.util.*;
 import java.util.stream.Collectors;
@@ -105,7 +104,7 @@ public void testAuto() throws Exception {
         Integer childId = engine.getMetadataManager().getSampleId(studyId, child);
         engine.getMetadataManager().updateSampleMetadata(studyId, childId, sampleMetadata -> sampleMetadata.setStats(stats.get(2)));
 
-        URI localOutputUri = newOutputUri();
+        URI localOutputUri = newOutputUri().resolve("stats.json");
         ObjectMap params = new ObjectMap().append(SampleVariantStatsDriver.SAMPLES, "auto")
                 .append(SampleVariantStatsDriver.OUTPUT, localOutputUri);
         getMrExecutor().run(SampleVariantStatsDriver.class, SampleVariantStatsDriver.buildArgs(null, engine.getVariantTableName(), 1, null, params), "");
@@ -115,9 +114,7 @@
         Assert.assertEquals(stats, actualStats);
 
-        List<File> files = new ArrayList<>(FileUtils.listFiles(new File(localOutputUri), null, true));
-        Assert.assertEquals(1, files.size());
-        List<SampleVariantStats> statsFromFile = JacksonUtils.getDefaultObjectMapper().readerFor(SampleVariantStats.class).readValues(files.get(0)).readAll();
+        List<SampleVariantStats> statsFromFile = JacksonUtils.getDefaultObjectMapper().readerFor(SampleVariantStats.class).readValues(Paths.get(localOutputUri).toFile()).readAll();
         Map<String, SampleVariantStats> statsFromFileMap = statsFromFile.stream().collect(Collectors.toMap(SampleVariantStats::getId, i -> i));
         Assert.assertEquals(stats.get(0), statsFromFileMap.get(father));
         Assert.assertEquals(stats.get(1), statsFromFileMap.get(mother));
@@ -151,9 +148,8 @@
         List<SampleVariantStats> actualStats = readStatsFromMeta();
 
-        // When processing a child, its parents must be processed as well
-        Assert.assertEquals(3, actualStats.size());
-        Assert.assertEquals(stats, actualStats);
+        Assert.assertEquals(1, actualStats.size());
+        Assert.assertEquals(stats.get(2), actualStats.get(0));
     }
 
     public List<SampleVariantStats> readStatsFromMeta() throws StorageEngineException {
diff --git a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/test/java/org/opencb/opencga/storage/hadoop/variant/walker/HadoopVariantWalkerTest.java b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/test/java/org/opencb/opencga/storage/hadoop/variant/walker/HadoopVariantWalkerTest.java
index 0d9ab975ed7..b3938e61758 100644
--- a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/test/java/org/opencb/opencga/storage/hadoop/variant/walker/HadoopVariantWalkerTest.java
+++ b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/test/java/org/opencb/opencga/storage/hadoop/variant/walker/HadoopVariantWalkerTest.java
@@ -15,6 +15,7 @@
 import org.opencb.opencga.storage.hadoop.variant.HadoopVariantStorageEngine;
 import org.opencb.opencga.storage.hadoop.variant.HadoopVariantStorageTest;
 import org.opencb.opencga.storage.hadoop.variant.VariantHbaseTestUtils;
+import org.opencb.opencga.storage.hadoop.variant.mr.StreamVariantMapper;
 
 import java.io.IOException;
 import java.net.URI;
@@ -109,8 +110,14 @@ public void exportDocker() throws Exception {
         URI outdir = newOutputUri();
 
         String cmdPython1 = "python variant_walker.py walker_example Cut --length 30";
-
+        variantStorageEngine.getOptions().put(StreamVariantMapper.DOCKER_PRUNE_OPTS, " --filter label!=opencga_scope='test'");
         variantStorageEngine.walkData(outdir.resolve("variant4.txt.gz"), VariantWriterFactory.VariantOutputFormat.JSON, new Query(), new QueryOptions(), dockerImage, cmdPython1);
+
+        // Ensure that the docker image is not pruned
+        Command dockerImages = new Command(new String[]{"docker", "images", "--filter", "label=opencga_scope=test"}, Collections.emptyMap());
+        dockerImages.run();
+        assertEquals(0, dockerImages.getExitValue());
+        assertEquals(2, dockerImages.getOutput().split("\n").length);
     }
 
     private static String buildDocker() throws IOException {
diff --git a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/test/resources/variantWalker/Dockerfile b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/test/resources/variantWalker/Dockerfile
index bd9f5511adf..ca17155b91c 100644
--- a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/test/resources/variantWalker/Dockerfile
+++ b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/test/resources/variantWalker/Dockerfile
@@ -6,7 +6,7 @@ WORKDIR /app
 
 ARG PYTHON_PATH="."
 
-LABEL storage="do_not_delete"
+LABEL opencga_scope="test"
 
 RUN echo ${PYTHON_PATH}
 # Copy the python directory contents into the container at /app

From 54acc28d8a8fe0164d03fc116e27b0cc37ddc2ed Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jacobo=20Coll=20Morag=C3=B3n?=
Date: Wed, 4 Dec 2024 09:58:56 +0000
Subject: [PATCH 63/66] cicd: Increase "Publish Test Report on GitHub" memory
 #TASK-6722

---
 .github/workflows/test-analysis.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/test-analysis.yml b/.github/workflows/test-analysis.yml
index 1962c89ddc7..e8dec69c79d 100644
--- a/.github/workflows/test-analysis.yml
+++ b/.github/workflows/test-analysis.yml
@@ -108,7 +108,7 @@ jobs:
       - name: Publish Test Report on GitHub
         uses: scacap/action-surefire-report@v1
         env:
-          NODE_OPTIONS: '--max_old_space_size=4096'
+          NODE_OPTIONS: '--max_old_space_size=6144'
         ## Skip cancelled()
         ## https://docs.github.com/en/actions/learn-github-actions/expressions#cancelled
         if: success() || failure()
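The DOCKER_PRUNE_OPTS option introduced in StreamVariantMapper above is spliced verbatim into a guarded shell one-liner, so tests can exclude their own images from pruning with a label filter. A sketch of how such a command string is assembled (constants are illustrative; the real code builds it inline in dockerPruneImages):

    public class DockerPruneCommandSketch {
        static String buildPruneCommand(int maxImages, String extraOpts) {
            // Prune only when more than maxImages images are present, honouring any extra filters
            return "[ $(docker image ls --format json | wc -l) -gt " + maxImages + " ] "
                    + "&& echo 'Run docker image prune' && docker image prune -f --all " + extraOpts
                    + " || echo 'Skipping docker image prune. Less than " + maxImages + " images.'";
        }

        public static void main(String[] args) {
            // Keep images labelled opencga_scope=test, as HadoopVariantWalkerTest does above
            System.out.println(buildPruneCommand(5, "--filter label!=opencga_scope='test'"));
        }
    }
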
From 4e96492c592c70fca06823908537071c6f05b4e8 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jacobo=20Coll=20Morag=C3=B3n?=
Date: Wed, 4 Dec 2024 09:59:10 +0000
Subject: [PATCH 64/66] core: Fix NumberFormatException from IOUtils.
 #TASK-6722

---
 .../main/java/org/opencb/opencga/core/common/IOUtils.java | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/opencga-core/src/main/java/org/opencb/opencga/core/common/IOUtils.java b/opencga-core/src/main/java/org/opencb/opencga/core/common/IOUtils.java
index bfc9f3524ba..2446cc8956c 100644
--- a/opencga-core/src/main/java/org/opencb/opencga/core/common/IOUtils.java
+++ b/opencga-core/src/main/java/org/opencb/opencga/core/common/IOUtils.java
@@ -28,7 +28,6 @@
 import java.util.LinkedList;
 import java.util.List;
 import java.util.concurrent.ArrayBlockingQueue;
-import java.util.concurrent.BlockingQueue;
 import java.util.concurrent.ExecutionException;
 import java.util.concurrent.atomic.AtomicReference;
 import java.util.regex.Pattern;
@@ -399,12 +398,12 @@ public static long fromHumanReadableToByte(String value, boolean assumeBinary) {
             value = value.substring(0, value.length() - 1);
         }
         final boolean si;
-        if (assumeBinary) {
+        if (value.endsWith("i")) {
             si = false;
+            value = value.substring(0, value.length() - 1);
         } else {
-            if (value.endsWith("i")) {
+            if (assumeBinary) {
                 si = false;
-                value = value.substring(0, value.length() - 1);
             } else {
                 si = true;
             }
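The fix above changes the precedence of the two signals: an explicit binary suffix ("10GiB") is now stripped and honoured even when assumeBinary is set, so the trailing "i" can no longer leak into Long.parseLong. A compact re-implementation of the corrected decision, illustrative only (the real method handles more units and validation):

    public class HumanReadableBytesSketch {
        static long parse(String value, boolean assumeBinary) {
            if (value.endsWith("B")) {
                value = value.substring(0, value.length() - 1);     // "10GiB" -> "10Gi"
            }
            final boolean si;
            if (value.endsWith("i")) {
                si = false;                                         // explicit binary suffix wins
                value = value.substring(0, value.length() - 1);     // "10Gi" -> "10G"
            } else {
                si = !assumeBinary;
            }
            long base = si ? 1000L : 1024L;
            char unit = value.charAt(value.length() - 1);
            long multiplier = unit == 'K' ? base
                    : unit == 'M' ? base * base
                    : unit == 'G' ? base * base * base
                    : 1L;
            String digits = multiplier == 1L ? value : value.substring(0, value.length() - 1);
            return Long.parseLong(digits.trim()) * multiplier;
        }

        public static void main(String[] args) {
            System.out.println(parse("10GB", false));  // 10000000000
            System.out.println(parse("10GiB", false)); // 10737418240, no NumberFormatException
            System.out.println(parse("10GB", true));   // 10737418240
        }
    }
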
From 852ffcae7fe8ce913590a2ad278d3a77d29b7a5c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jacobo=20Coll=20Morag=C3=B3n?=
Date: Thu, 19 Dec 2024 10:58:10 +0000
Subject: [PATCH 65/66] core: Remove unused method. #TASK-6722

---
 .../opencb/opencga/core/common/IOUtils.java | 47 -------------------
 1 file changed, 47 deletions(-)

diff --git a/opencga-core/src/main/java/org/opencb/opencga/core/common/IOUtils.java b/opencga-core/src/main/java/org/opencb/opencga/core/common/IOUtils.java
index 2446cc8956c..e5cf6fbf4b4 100644
--- a/opencga-core/src/main/java/org/opencb/opencga/core/common/IOUtils.java
+++ b/opencga-core/src/main/java/org/opencb/opencga/core/common/IOUtils.java
@@ -16,8 +16,6 @@
 package org.opencb.opencga.core.common;
 
-import org.opencb.commons.run.ParallelTaskRunner;
-
 import java.io.*;
 import java.nio.ByteBuffer;
 import java.nio.charset.Charset;
@@ -28,7 +26,6 @@
 import java.util.LinkedList;
 import java.util.List;
 import java.util.concurrent.ArrayBlockingQueue;
-import java.util.concurrent.ExecutionException;
 import java.util.concurrent.atomic.AtomicReference;
 import java.util.regex.Pattern;
 import java.util.zip.ZipEntry;
@@ -477,48 +474,4 @@ public static void copyBytesParallel(InputStream is, OutputStream os, int buffer
             throw new IOException(exception.get());
         }
     }
-
-    public static void copyBytesParallel2(InputStream is, OutputStream os, int bufferSize) throws IOException {
-
-        List<ByteBuffer> buffersPool = Collections.synchronizedList(new LinkedList<>());
-        ParallelTaskRunner.Config config = ParallelTaskRunner.Config.builder()
-                .setNumTasks(1)
-                .setCapacity(5)
-                .setSorted(true)
-                .build();
-        ParallelTaskRunner runner = new ParallelTaskRunner<>(batchSize -> {
-            try {
-                ByteBuffer buf = buffersPool.isEmpty() ? ByteBuffer.allocate(bufferSize) : buffersPool.remove(0);
-                int bytesRead = is.read(buf.array());
-                if (bytesRead > 0) {
-                    if (bytesRead != buf.array().length) {
-                        buf.limit(bytesRead);
-                        buf.rewind();
-                    }
-                    return Collections.singletonList(buf);
-                } else {
-                    return Collections.emptyList();
-                }
-            } catch (IOException e) {
-                throw new UncheckedIOException(e);
-            }
-        }, t -> t, batch -> {
-            try {
-                for (ByteBuffer buf : batch) {
-                    os.write(buf.array(), 0, buf.limit());
-                    // Return the buffer to the pool
-                    buf.clear();
-                    buffersPool.add(buf);
-                }
-            } catch (IOException e1) {
-                throw new UncheckedIOException(e1);
-            }
-            return true;
-        }, config);
-        try {
-            runner.run();
-        } catch (ExecutionException e) {
-            throw new IOException(e);
-        }
-    }
 }

From 005855f4306c1213481ec52ebed90c7651ee1304 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jacobo=20Coll=20Morag=C3=B3n?=
Date: Thu, 19 Dec 2024 17:23:07 +0000
Subject: [PATCH 66/66] storage: Do not add new abstract methods to
 VariantStorageEngine. #TASK-6722

---
 .../opencga/storage/core/variant/VariantStorageEngine.java | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/opencga-storage/opencga-storage-core/src/main/java/org/opencb/opencga/storage/core/variant/VariantStorageEngine.java b/opencga-storage/opencga-storage-core/src/main/java/org/opencb/opencga/storage/core/variant/VariantStorageEngine.java
index 81ddc4c0e3d..e559592aa28 100644
--- a/opencga-storage/opencga-storage-core/src/main/java/org/opencb/opencga/storage/core/variant/VariantStorageEngine.java
+++ b/opencga-storage/opencga-storage-core/src/main/java/org/opencb/opencga/storage/core/variant/VariantStorageEngine.java
@@ -326,9 +326,11 @@ public List walkData(URI outputFile, VariantWriterFactory.VariantOutputForm
     }
 
-    public abstract List walkData(URI outputFile, VariantOutputFormat format, Query query, QueryOptions queryOptions,
+    public List walkData(URI outputFile, VariantOutputFormat format, Query query, QueryOptions queryOptions,
                                   String commandLine)
-            throws StorageEngineException;
+            throws StorageEngineException {
+        throw new UnsupportedOperationException();
+    }
 
     /**
      * Creates a new {@link VariantExporter} for the current backend.
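The final patch keeps VariantStorageEngine binary-compatible for existing engine implementations: instead of adding a new abstract method, walkData gains a default body that rejects the operation, and only engines that can stream variants through an external command override it. The shape of the pattern, simplified (the real signature also takes a Query, QueryOptions and an output format):

    import java.net.URI;
    import java.util.Collections;
    import java.util.List;

    abstract class EngineSketch {
        public List<URI> walkData(URI outputFile, String commandLine) {
            // Engines without streaming support inherit this default and fail fast
            throw new UnsupportedOperationException("walkData not supported by " + getClass().getSimpleName());
        }
    }

    class StreamingEngineSketch extends EngineSketch {
        @Override
        public List<URI> walkData(URI outputFile, String commandLine) {
            // A real engine would pipe each variant through commandLine and collect the outputs
            return Collections.singletonList(outputFile);
        }

        public static void main(String[] args) {
            System.out.println(new StreamingEngineSketch()
                    .walkData(URI.create("file:///tmp/out.json.gz"), "python variant_walker.py"));
        }
    }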