diff --git a/hpg-bigdata-analysis/pom.xml b/hpg-bigdata-analysis/pom.xml
index 768cb6d4..f43f97be 100644
--- a/hpg-bigdata-analysis/pom.xml
+++ b/hpg-bigdata-analysis/pom.xml
@@ -84,6 +84,11 @@
             <groupId>junit</groupId>
             <artifactId>junit</artifactId>
         </dependency>
+        <dependency>
+            <groupId>com.beust</groupId>
+            <artifactId>jcommander</artifactId>
+            <version>1.58</version>
+        </dependency>
     </dependencies>
 </project>
\ No newline at end of file
diff --git a/hpg-bigdata-analysis/src/main/java/org/opencb/hpg/bigdata/analysis/AnalysisExecutor.java b/hpg-bigdata-analysis/src/main/java/org/opencb/hpg/bigdata/analysis/AnalysisExecutor.java
index 9444df34..8ec3aafd 100644
--- a/hpg-bigdata-analysis/src/main/java/org/opencb/hpg/bigdata/analysis/AnalysisExecutor.java
+++ b/hpg-bigdata-analysis/src/main/java/org/opencb/hpg/bigdata/analysis/AnalysisExecutor.java
@@ -4,15 +4,19 @@
  * Created by jtarraga on 30/01/17.
  */
 public abstract class AnalysisExecutor {
-    protected String datasetName;
+    protected String studyId;
 
-    public String getDatasetName() {
-        return datasetName;
+    protected AnalysisExecutor(String studyId) {
+        this.studyId = studyId;
     }
 
-    public void setDatasetName(String datasetName) {
-        this.datasetName = datasetName;
+    protected String studyId() {
+        return studyId;
     }
 
-    public abstract void execute() throws AnalysisExecutorException;
+    protected void setStudyId(String studyId) {
+        this.studyId = studyId;
+    }
+
+    protected abstract void execute() throws AnalysisExecutorException;
 }
diff --git a/hpg-bigdata-analysis/src/main/java/org/opencb/hpg/bigdata/analysis/variant/FilterParameters.java b/hpg-bigdata-analysis/src/main/java/org/opencb/hpg/bigdata/analysis/variant/FilterParameters.java
new file mode 100644
index 00000000..bdbb7e66
--- /dev/null
+++ b/hpg-bigdata-analysis/src/main/java/org/opencb/hpg/bigdata/analysis/variant/FilterParameters.java
@@ -0,0 +1,105 @@
+package org.opencb.hpg.bigdata.analysis.variant;
+
+import com.beust.jcommander.Parameter;
+
+public class FilterParameters {
+    @Parameter(names = {"--id"}, description = "Query for ID; comma separated list of IDs, e.g.:"
+            + " \"rs312411,rs421225\"", arity = 1)
+    public String ids;
+
+    @Parameter(names = {"--id-file"}, description = "Query for IDs that are stored in a file, one ID per line,"
+            + " e.g.: rs312411", arity = 1)
+    public String idFilename;
+
+    @Parameter(names = {"--type"}, description = "Query for type; comma separated list of types, e.g.:"
+            + " \"INDEL,SNP,SNV\"", arity = 1)
+    public String types;
+
+    @Parameter(names = {"-s", "--study"}, description = "Query for study; comma separated list of study names",
+            arity = 1)
+    public String studies;
+
+    @Parameter(names = {"--biotype"}, description = "Query for biotype; comma separated list of biotype names,"
+            + " e.g.: protein_coding, pseudogene", arity = 1)
+    public String biotypes;
+
+    @Parameter(names = {"-r", "--region"}, description = "Query for region; comma separated list of regions,"
+            + " e.g.: 1:300000-400000000,15:343453463-8787665654", arity = 1)
+    public String regions;
+
+    @Parameter(names = {"--region-file"}, description = "Query for regions that are stored in a file, one region"
+            + " per line, e.g.: 1:6700000-560000000", arity = 1)
+    public String regionFilename;
+
+    @Parameter(names = {"--maf"}, description = "Query for the Minor Allele Frequency of a given study and"
+            + " cohort. Use the following format enclosed with double quotes: \"study_name::cohort_name"
+            + "[<|>|<=|>=|==|!=]value\", e.g.: \"1000g::all>0.4\"", arity = 1)
+    public String maf;
+
+    @Parameter(names = {"--mgf"}, description = "Query for the Minor Genotype Frequency of a given study and"
+            + " cohort. Use the following format enclosed with double quotes: \"study_name::cohort_name"
+            + "[<|>|<=|>=|==|!=]value\", e.g.: \"1000g::all>0.18198\"", arity = 1)
+    public String mgf;
+
+    @Parameter(names = {"--missing-allele"}, description = "Query for the number of missing alleles of a given"
+            + " study and cohort. Use the following format enclosed with double quotes: \"study_name::cohort_name"
+            + "[<|>|<=|>=|==|!=]value\", e.g.: \"1000g::all==5\"", arity = 1)
+    public String missingAlleles;
+
+    @Parameter(names = {"--missing-genotype"}, description = "Query for the number of missing genotypes of a"
+            + " given study and cohort. Use the following format enclosed with double quotes: \"study_name::"
+            + "cohort_name[<|>|<=|>=|==|!=]value\", e.g.: \"1000g::all!=0\"", arity = 1)
+    public String missingGenotypes;
+
+    @Parameter(names = {"--ct", "--consequence-type"}, description = "Query for Sequence Ontology term names or"
+            + " accession codes; comma separated (use double quotes if you provide term names), e.g.:"
+            + " \"transgenic insertion,SO:32234,SO:00124\"", arity = 1)
+    public String consequenceTypes;
+
+    @Parameter(names = {"--gene"}, description = "Query for gene; comma separated list of gene names, e.g.:"
+            + " \"BIN3,ZNF517\"", arity = 1)
+    public String genes;
+
+    @Parameter(names = {"--clinvar"}, description = "Query for clinvar (accession); comma separated list of"
+            + " accessions", arity = 1)
+    public String clinvar;
+
+    @Parameter(names = {"--cosmic"}, description = "Query for cosmic (mutation ID); comma separated list of"
+            + " mutation IDs", arity = 1)
+    public String cosmic;
+
+//    @Parameter(names = {"--gwas"}, description = "Query for gwas (traits); comma separated list of traits",
+//            arity = 1)
+//    public String gwas;
+
+    @Parameter(names = {"--conservation"}, description = "Query for conservation scores (phastCons, phylop, gerp);"
+            + " comma separated list of scores and enclosed with double quotes, e.g.: \"phylop<0.3,phastCons<0.1\"",
+            arity = 1)
+    public String conservScores;
+
+    @Parameter(names = {"--ps", "--protein-substitution"}, description = "Query for protein substitution scores"
+            + " (polyphen, sift); comma separated list of scores and enclosed with double quotes, e.g.:"
+            + " \"polyphen>0.3,sift>0.6\"", arity = 1)
+    public String substScores;
+
+    @Parameter(names = {"--pf", "--population-frequency"}, description = "Query for alternate population"
+            + " frequency of a given study. Use the following format enclosed with double quotes:"
+            + " \"study_name::population_name[<|>|<=|>=|==|!=]frequency_value\", e.g.: \"1000g::CEU<0.4\"",
+            arity = 1)
+    public String pf;
+
+    @Parameter(names = {"--pmaf", "--population-maf"}, description = "Query for population minor allele frequency"
+            + " of a given study. Use the following format enclosed with double quotes: \"study_name::"
+            + "population_name[<|>|<=|>=|==|!=]frequency_value\", e.g.: \"1000g::PJL<=0.25\"", arity = 1)
+    public String pmaf;
+
+    @Parameter(names = {"--sample-genotype"}, description = "Query for sample genotypes. Use the following"
+            + " format enclosed with double quotes: \"sample_name1:genotype1,genotype2;sample_name2:genotype1\","
+            + " e.g.: \"HG00112:0/0;HG23412:1/0,1/1\"", arity = 1)
+    public String sampleGenotypes;
+
+    @Parameter(names = {"--sample-filter"}, description = "Query for sample filter, i.e.: individual attributes (family, father,"
+            + " mother, sex and phenotype) and user-defined attributes from pedigree information,"
+            + " e.g.: \"individual.sex=MALE;Eyes=Blue\"", arity = 1)
+    public String sampleFilters;
+}
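Note: these @Parameter fields are plain JCommander bindings; in this patch they are wired into the CLI as a @ParametersDelegate (see VariantCommandOptions below). A minimal, hypothetical standalone parse illustrates the behaviour (demo class and argument values are illustrative only, not part of this patch):

    import com.beust.jcommander.JCommander;

    // Hypothetical demo, not part of this patch.
    public class FilterParametersDemo {
        public static void main(String[] args) {
            FilterParameters filters = new FilterParameters();
            new JCommander(filters).parse("--type", "SNV,INDEL", "-r", "22:32755892-32767063");
            System.out.println(filters.types);   // SNV,INDEL
            System.out.println(filters.regions); // 22:32755892-32767063
        }
    }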
diff --git a/hpg-bigdata-analysis/src/main/java/org/opencb/hpg/bigdata/analysis/variant/LinearRegressionAnalysis.java b/hpg-bigdata-analysis/src/main/java/org/opencb/hpg/bigdata/analysis/variant/LinearRegressionAnalysis.java
index b83b5f89..ca5ca534 100644
--- a/hpg-bigdata-analysis/src/main/java/org/opencb/hpg/bigdata/analysis/variant/LinearRegressionAnalysis.java
+++ b/hpg-bigdata-analysis/src/main/java/org/opencb/hpg/bigdata/analysis/variant/LinearRegressionAnalysis.java
@@ -63,14 +63,13 @@ public void execute() {
         System.out.println("r2: " + trainingSummary.r2());
     }
 
-    public LinearRegressionAnalysis(String datasetName, String studyName, String depVarName, String indepVarName) {
-        this(datasetName, studyName, depVarName, indepVarName, 10, 0.3, 0.8);
+    public LinearRegressionAnalysis(String studyId, String depVarName, String indepVarName) {
+        this(studyId, depVarName, indepVarName, 10, 0.3, 0.8);
     }
 
-    public LinearRegressionAnalysis(String datasetName, String studyName, String depVarName, String indepVarName,
+    public LinearRegressionAnalysis(String studyId, String depVarName, String indepVarName,
                                     int numIterations, double regularization, double elasticNet) {
-        this.datasetName = datasetName;
-        this.studyName = studyName;
+        super(studyId);
         this.depVarName = depVarName;
         this.indepVarName = indepVarName;
         this.numIterations = numIterations;
diff --git a/hpg-bigdata-analysis/src/main/java/org/opencb/hpg/bigdata/analysis/variant/LogisticRegressionAnalysis.java b/hpg-bigdata-analysis/src/main/java/org/opencb/hpg/bigdata/analysis/variant/LogisticRegressionAnalysis.java
index 4b58c3e3..d02a20e7 100644
--- a/hpg-bigdata-analysis/src/main/java/org/opencb/hpg/bigdata/analysis/variant/LogisticRegressionAnalysis.java
+++ b/hpg-bigdata-analysis/src/main/java/org/opencb/hpg/bigdata/analysis/variant/LogisticRegressionAnalysis.java
@@ -70,14 +70,13 @@ public void execute() {
         lrModel.setThreshold(bestThreshold);
     }
 
-    public LogisticRegressionAnalysis(String datasetName, String studyName, String depVarName, String indepVarName) {
-        this(datasetName, studyName, depVarName, indepVarName, 10, 0.3, 0.8);
+    public LogisticRegressionAnalysis(String studyId, String depVarName, String indepVarName) {
+        this(studyId, depVarName, indepVarName, 10, 0.3, 0.8);
     }
 
-    public LogisticRegressionAnalysis(String datasetName, String studyName, String depVarName, String indepVarName,
+    public LogisticRegressionAnalysis(String studyId, String depVarName, String indepVarName,
                                       int numIterations, double regularization, double elasticNet) {
-        this.datasetName = datasetName;
-        this.studyName = studyName;
+        super(studyId);
         this.depVarName = depVarName;
         this.indepVarName = indepVarName;
         this.numIterations = numIterations;
diff --git a/hpg-bigdata-analysis/src/main/java/org/opencb/hpg/bigdata/analysis/variant/PCAAnalysis.java b/hpg-bigdata-analysis/src/main/java/org/opencb/hpg/bigdata/analysis/variant/PCAAnalysis.java
index 52deb02e..af33e6c9 100644
--- a/hpg-bigdata-analysis/src/main/java/org/opencb/hpg/bigdata/analysis/variant/PCAAnalysis.java
+++ b/hpg-bigdata-analysis/src/main/java/org/opencb/hpg/bigdata/analysis/variant/PCAAnalysis.java
@@ -29,13 +29,12 @@ public void execute() {
         result.show(false);
     }
 
-    public PCAAnalysis(String datasetName, String studyName, String featureName) {
-        this(datasetName, studyName, featureName, 3);
+    public PCAAnalysis(String studyId, String featureName) {
+        this(studyId, featureName, 3);
     }
 
-    public PCAAnalysis(String datasetName, String studyName, String featureName, int kValue) {
-        this.datasetName = datasetName;
-        this.studyName = studyName;
+    public PCAAnalysis(String studyId, String featureName, int kValue) {
+        super(studyId);
         this.featureName = featureName;
         this.kValue = kValue;
     }
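Note: the (numIterations, regularization, elasticNet) defaults of (10, 0.3, 0.8) used by the constructors above correspond to standard Spark ML estimator parameters; execute() printing trainingSummary.r2() suggests org.apache.spark.ml.regression.LinearRegression underneath. A hedged sketch of that mapping (the helper method and column conventions are assumptions, not shown in this diff):

    import org.apache.spark.ml.regression.LinearRegression;
    import org.apache.spark.ml.regression.LinearRegressionModel;
    import org.apache.spark.sql.Dataset;
    import org.apache.spark.sql.Row;

    // Sketch only: "training" is assumed to carry a vector "features" column
    // (e.g. assembled from indepVarName) and a "label" column (depVarName).
    static LinearRegressionModel fitWithDefaults(Dataset<Row> training) {
        LinearRegression lr = new LinearRegression()
                .setMaxIter(10)            // numIterations
                .setRegParam(0.3)          // regularization
                .setElasticNetParam(0.8);  // elasticNet
        return lr.fit(training);
    }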
diff --git a/hpg-bigdata-analysis/src/main/java/org/opencb/hpg/bigdata/analysis/variant/RvTestsAdaptor.java b/hpg-bigdata-analysis/src/main/java/org/opencb/hpg/bigdata/analysis/variant/RvTestsAdaptor.java
deleted file mode 100644
index df9d9f07..00000000
--- a/hpg-bigdata-analysis/src/main/java/org/opencb/hpg/bigdata/analysis/variant/RvTestsAdaptor.java
+++ /dev/null
@@ -1,468 +0,0 @@
-package org.opencb.hpg.bigdata.analysis.variant;
-
-import org.apache.spark.SparkConf;
-import org.apache.spark.SparkContext;
-import org.apache.spark.api.java.JavaPairRDD;
-import org.apache.spark.api.java.JavaRDD;
-import org.apache.spark.api.java.function.Function;
-import org.apache.spark.sql.Dataset;
-import org.apache.spark.sql.Row;
-import org.apache.spark.sql.SparkSession;
-import org.opencb.biodata.formats.pedigree.PedigreeManager;
-import org.opencb.biodata.models.core.Region;
-import org.opencb.biodata.models.core.pedigree.Pedigree;
-import org.opencb.biodata.tools.variant.metadata.VariantMetadataManager;
-import org.opencb.commons.utils.FileUtils;
-import org.opencb.hpg.bigdata.analysis.AnalysisExecutor;
-import org.opencb.hpg.bigdata.analysis.AnalysisExecutorException;
-import org.opencb.hpg.bigdata.core.lib.SparkConfCreator;
-import org.opencb.hpg.bigdata.core.lib.VariantDataset;
-import scala.Tuple2;
-import scala.collection.mutable.StringBuilder;
-import scala.collection.mutable.WrappedArray;
-
-import java.io.*;
-import java.nio.file.Path;
-import java.nio.file.Paths;
-import java.util.*;
-
-/**
- * Created by joaquin on 1/19/17.
- */
-public class RvTestsAdaptor extends AnalysisExecutor implements Serializable {
-    private String inFilename;
-    private String metaFilename;
-    private String outDirname;
-    private String confFilename;
-
-    private final String RVTEST_BIN = "rvtests"; //"/home/jtarraga/soft/rvtests/executable/rvtest";
-    private final String BGZIP_BIN = "bgzip"; //"/home/joaquin/softs/htslib/bgzip";
-    private final String TABIX_BIN = "tabix"; //"/home/joaquin/softs/htslib/tabix";
-
-    public RvTestsAdaptor(String inFilename, String outDirname, String confFilename) {
-        this(inFilename, inFilename + ".meta.json", outDirname, confFilename);
-    }
-
-    public RvTestsAdaptor(String inFilename, String metaFilename, String outDirname, String confFilename) {
-        this.inFilename = inFilename;
-        this.metaFilename = metaFilename;
-        this.outDirname = outDirname;
-        this.confFilename = confFilename;
-    }
-
-//    ./build/bin/hpg-bigdata-local2.sh variant rvtests -i ~/data/vcf/skat/example.vcf.avro -o ~/data/vcf/skat/out
-//    --dataset noname -c ~/data/vcf/skat/skat.params
-
-    public void run(String datasetName) throws Exception {
-        // create spark session
-        SparkConf sparkConf;
-        if (inFilename.startsWith("/")) {
-            sparkConf = SparkConfCreator.getConf("variant rvtests", "local", 1, true);
-        } else {
-            sparkConf = new SparkConf().setAppName("variant rvtests");
-        }
-        SparkSession sparkSession = new SparkSession(new SparkContext(sparkConf));
-
-        // load dataset
-        VariantDataset vd = new VariantDataset(sparkSession);
-        vd.load(inFilename);
-        vd.createOrReplaceTempView("vcf");
-
-        // load rvtests parameters into properties
-        Properties props = new Properties();
-        InputStream confStream = new FileInputStream(confFilename);
-        props.load(confStream);
-        confStream.close();
-
-        for (Object key: props.keySet()) {
-            System.out.println((String) key + " = " + (String) props.get(key));
-        }
-
-        List<String> kernels = null;
-        String kernel = props.getProperty("kernel");
-        if (kernel != null) {
-            kernels = Arrays.asList(kernel.toLowerCase()
-                    .replace("skato", "SkatO")
-                    .replace("skat", "Skat")
-                    .split(","));
-        }
-
-        // create temporary directory
-        File tmpDir = new File(outDirname + "/tmp");
-        tmpDir.mkdir();
-
-        // create temporary file for --pheno
-        File phenoFile = new File(tmpDir.getAbsolutePath() + "/pheno");
-        VariantMetadataManager metadataManager = new VariantMetadataManager();
-        metadataManager.load(Paths.get(metaFilename));
-        Pedigree pedigree = metadataManager.getPedigree(datasetName);
-        new PedigreeManager().save(pedigree, phenoFile.toPath());
-
-        // loop for regions
-        String line;
-        BufferedWriter writer = null;
-        BufferedReader reader = FileUtils.newBufferedReader(Paths.get(props.getProperty("setFile")));
-        int i = 0;
-        StringBuilder sb = new StringBuilder();
-        while ((line = reader.readLine()) != null) {
-            String[] fields = line.split("[\t ]");
-            System.out.println(fields[0]);
-            Region region = new Region(fields[1]);
-
-            // create temporary files for --inVcf and --setFile
-            File setFile = new File(tmpDir.getAbsolutePath() + "/setFile." + i);
-            writer = FileUtils.newBufferedWriter(setFile.toPath());
-            writer.write(fields[0] + "\t" + fields[1] + "\n");
-            writer.close();
-
-            // create temporary vcf file fot the region variants
-            VariantDataset ds = (VariantDataset) vd.regionFilter(region);
-            File vcfFile = new File(tmpDir.getAbsolutePath() + "/variants." + i + ".vcf");
-            writer = FileUtils.newBufferedWriter(vcfFile.toPath());
-
-            // header lines
-            writer.write("##fileformat=VCFv4.2\n");
-            writer.write("#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT");
-            // sample names
-            for (String key: pedigree.getIndividuals().keySet()) {
-                writer.write("\t");
-                writer.write(pedigree.getIndividuals().get(key).getId());
-            }
-            writer.write("\n");
-
-            // variant lines
-//            writer.write(i + " >>>>>>>>>>>>>> " + line + ", num. rows = " + ds.count() + "\n");
-//            writer.close();
-//            System.out.println("\n\n" + i + " >>>>>>>>>>>>>> " + line + ", num. rows = " + ds.count() + "\n");
-//            ds.show(1, false);
-//            ds.reset();
-
-            //System.out.println("----------> !!!! rows should be sorted by start !!!" + ds.getSql());
-//            ds.executeSql(ds.getSql()).show();
-//            System.out.println("----------> rows should be sorted by start !!!");
-//            ds.executeSql("SELECT * FROM vcf WHERE (chromosome = '22' AND start >= 32755892 AND end <= 32767063)"
-//                    + " ORDER BY start ASC").show();
-//            System.exit(0);
-
-            //List<Row> rows = ds.collectAsList();
-            List<Row> rows = ds.executeSql(ds.getSql()).collectAsList();
-            ds.reset();
-
-            for (Row row: rows) {
-                List<Row> studies = row.getList(12);
-                Row study = null;
-                for (int j = 0; j < studies.size(); j++) {
-                    if (studies.get(j).get(0).equals(datasetName)) {
-                        study = studies.get(j);
-                        break;
-                    }
-                }
-
-                if (study == null) {
-                    throw new Exception("Dataset '" + datasetName + "' not found!");
-                }
-
-//                System.out.println("\n\n\n" + row.mkString() + "\n\n");
-                List<Row> files = study.getList(1);
-                String filter = (String) files.get(0).getJavaMap(2).get("FILTER");
-                String qual = (String) files.get(0).getJavaMap(2).get("QUAL");
-
-                sb.setLength(0);
-                sb.append(row.get(2)).append("\t").append(row.get(3)).append("\t")
-                        .append(row.get(0) == null ? "." : row.get(0)).append("\t")
-                        .append(row.get(5)).append("\t").append(row.get(6)).append("\t").append(qual).append("\t")
-                        .append(filter).append("\t").append(".").append("\t").append(study.getList(3).get(0));
-
-                int j = 0;
-                try {
-//                    System.out.println("size for (" + i + "): " + sb.toString());
-//                    System.out.println("\tsize = " + study.getList(4).size());
-                    for (j = 0; j < pedigree.getIndividuals().size(); j++) {
-//                        System.out.println(j + ": ");
-//                        System.out.println("\t" + ((WrappedArray) study.getList(4).get(j)).head());
-                        sb.append("\t").append(((WrappedArray) study.getList(4).get(j)).head());
-                    }
-                } catch (Exception ex) {
-                    System.out.println("Exception for sample data: " + sb.toString());
-                    System.out.println("(i, j, sample data size, pedigree size) = ("
-                            + i + ", " + j + ", " + study.getList(4).size() + ", " + pedigree.getIndividuals().size());
-                    System.out.println(ex.getMessage());
-                    ex.printStackTrace();
-                }
-                sb.append("\n");
-                writer.write(sb.toString());
-//                System.out.println(sb.toString());
-            }
-            writer.close();
-
-            // compress vcf to bgz
-            sb.setLength(0);
-            sb.append(props.getProperty("bgzip", this.BGZIP_BIN)).append(" ").append(vcfFile.getAbsolutePath());
-            execute(sb.toString(), "Compressing vcf to gz: " + sb.toString());
-
-            // and create tabix index
-            sb.setLength(0);
-            sb.append(props.getProperty("tabix", this.TABIX_BIN)).append(" -p vcf ").append(vcfFile.getAbsolutePath()).append(".gz");
-            execute(sb.toString(), "Creating tabix index: " + sb);
-
-            // rvtests command line
-            sb.setLength(0);
-            sb.append(props.getProperty("rvtests", this.RVTEST_BIN))
-                    .append(kernel == null ? " " : " --kernel " + kernel)
-                    .append(" --pheno ").append(phenoFile.getAbsolutePath())
-                    .append(" --inVcf ").append(vcfFile.getAbsolutePath()).append(".gz")
-                    .append(" --setFile ").append(setFile.getAbsolutePath())
-                    .append(" --out ").append(tmpDir.getAbsolutePath()).append("/out.").append(i);
-            execute(sb.toString(), "Running rvtests: " + sb);
-
-            i++;
-        }
-
-        // write final
-        for (String k: kernels) {
-            boolean header = false;
-            writer = FileUtils.newBufferedWriter(Paths.get(outDirname + "/out." + k + ".assoc"));
-            for (int j = 0; j < i; j++) {
-                File file = new File(tmpDir.getAbsolutePath() + "/out." + j + "." + k + ".assoc");
-                if (file.exists()) {
-                    reader = FileUtils.newBufferedReader(file.toPath());
-                    line = reader.readLine();
-                    if (!header) {
-                        writer.write(line);
-                        writer.write("\n");
-                        header = true;
-                    }
-                    while ((line = reader.readLine()) != null) {
-                        writer.write(line);
-                        writer.write("\n");
-                    }
-                    reader.close();
-                }
-            }
-            writer.close();
-        }
-    }
-
-    public void run00(String datasetName) throws Exception {
-        // create spark session
-        SparkConf sparkConf;
-        if (inFilename.startsWith("/")) {
-            sparkConf = SparkConfCreator.getConf("variant rvtests", "local", 1, true);
-        } else {
-            sparkConf = new SparkConf().setAppName("variant rvtests");
-        }
-        SparkSession sparkSession = new SparkSession(new SparkContext(sparkConf));
-
-        // load dataset
-        VariantDataset vd = new VariantDataset(sparkSession);
-        vd.load(inFilename);
-        vd.createOrReplaceTempView("vcf");
-
-        // load rvtests parameters into properties
-        Properties props = new Properties();
-        InputStream confStream = new FileInputStream(confFilename);
-        props.load(confStream);
-        confStream.close();
-
-        List<String> kernels = null;
-        String kernel = props.getProperty("kernel");
-        if (kernel != null) {
-            kernels = Arrays.asList(kernel.toLowerCase()
-                    .replace("skato", "SkatO")
-                    .replace("skat", "Skat")
-                    .split(","));
-        }
-
-        // create temporary directory
-        File tmpDir = new File(outDirname + "/tmp");
-        tmpDir.mkdir();
-
-        // create temporary file for --pheno
-        File phenoFile = new File(tmpDir.getAbsolutePath() + "/pheno");
-        VariantMetadataManager metadataManager = new VariantMetadataManager();
-        metadataManager.load(Paths.get(metaFilename));
-        Pedigree pedigree = metadataManager.getPedigree(datasetName);
-        new PedigreeManager().save(pedigree, phenoFile.toPath());
-
-        Map<String, String> pedigreeMap = new LinkedHashMap<>();
-        for (String key : pedigree.getIndividuals().keySet()) {
-            pedigreeMap.put(key, pedigree.getIndividuals().get(key).getId());
-        }
-
-        String sql = "SELECT * FROM vcf";
-        Dataset<Row> rows = vd.sqlContext().sql(sql);
-
-        List<String> regions = new ArrayList<>();
-        regions.add("22:40766594-40806293");
-        regions.add("22:32755892-32767063");
-        regions.add("22:32755894-32766972");
-        regions.add("22:29834571-29838444");
-        regions.add("22:31835344-31885547");
-        regions.add("22:31608249-31676066");
-        regions.add("22:38615297-38668670");
-        regions.add("22:38822332-38851203");
-        regions.add("22:20455993-20461786");
-
-        JavaPairRDD<String, Iterable<Row>> groups = rows.javaRDD().groupBy(row -> {
-            String key = null;
-            for (String region: regions) {
-                String[] fields = region.split("[:-]");
-                if (fields[0].equals(row.getString(2))
-                        && Integer.parseInt(fields[1]) <= row.getInt(4)
-                        && Integer.parseInt(fields[2]) >= row.getInt(3)) {
-                    key = region;
-                    break;
-                }
-            }
-            return key;
-        });
-
-        JavaRDD<String> results = groups.map(new Function<Tuple2<String, Iterable<Row>>, String>() {
-            @Override
-            public String call(Tuple2<String, Iterable<Row>> tuple) throws Exception {
-                StringBuilder sb = new StringBuilder();
-                BufferedWriter writer;
-                Iterator<Row> iterator = tuple._2.iterator();
-//                System.out.println("+++++++++++++ " + tuple._1);
-
-                String id = tuple._1.replace(":", "_").replace("-", "_");
-
-                // create temporary files for --inVcf and --setFile
-                File setFile = new File(tmpDir.getAbsolutePath() + "/setFile." + id);
-                writer = FileUtils.newBufferedWriter(setFile.toPath());
-                writer.write(id + "\t" + tuple._1 + "\n");
-                writer.close();
-
-                File vcfFile = new File(tmpDir.getAbsolutePath() + "/variants." + id + ".vcf");
-                writer = FileUtils.newBufferedWriter(vcfFile.toPath());
-                // header lines
-                writer.write("##fileformat=VCFv4.2\n");
-                writer.write("#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT");
-                // sample names
-                for (String key : pedigreeMap.keySet()) {
-                    writer.write("\t");
-                    writer.write(pedigreeMap.get(key));
-                }
-                writer.write("\n");
-                while (iterator.hasNext()) {
-                    Row row = iterator.next();
-//                    System.out.println("=================== " + row);
-
-                    // variant lines
-                    List<Row> studies = row.getList(12);
-                    Row study = null;
-                    for (int j = 0; j < studies.size(); j++) {
-                        if (studies.get(j).get(0).equals(datasetName)) {
-                            study = studies.get(j);
-                            break;
-                        }
-                    }
-
-                    if (study == null) {
-                        throw new Exception("Dataset '" + datasetName + "' not found!");
-                    }
-                    List<Row> files = study.getList(1);
-                    String filter = (String) files.get(0).getJavaMap(2).get("FILTER");
-                    String qual = (String) files.get(0).getJavaMap(2).get("QUAL");
-
-                    sb.setLength(0);
-                    sb.append(row.get(2)).append("\t").append(row.get(3)).append("\t")
-                            .append(row.get(0) == null ? "." : row.get(0)).append("\t")
-                            .append(row.get(5)).append("\t").append(row.get(6)).append("\t").append(qual).append("\t")
-                            .append(filter).append("\t").append(".").append("\t").append(study.getList(3).get(0));
-
-                    for (int j = 0; j < pedigreeMap.size(); j++) {
-                        sb.append("\t").append(((WrappedArray) study.getList(4).get(j)).head());
-                    }
-                    sb.append("\n");
-                    writer.write(sb.toString());
-                    System.out.println(sb.toString());
-                }
-                writer.close();
-
-                // compress vcf to bgz
-                sb.setLength(0);
-                sb.append(props.getProperty("bgzip")).append(" ").append(vcfFile.getAbsolutePath());
-                execute(sb.toString(), "Compressing vcf to gz: " + sb);
-
-                // and create tabix index
-                sb.setLength(0);
-                sb.append(props.getProperty("tabix")).append(" -p vcf ").append(vcfFile.getAbsolutePath()).append(".gz");
-                execute(sb.toString(), "Creating tabix index: " + sb);
-
-                // rvtests command line
-                sb.setLength(0);
-                sb.append(props.getProperty("rvtests"))
-                        .append(kernel == null ? " " : " --kernel " + kernel)
-                        .append(" --pheno ").append(phenoFile.getAbsolutePath())
-                        .append(" --inVcf ").append(vcfFile.getAbsolutePath()).append(".gz")
-                        .append(" --setFile ").append(setFile.getAbsolutePath())
-                        .append(" --out ").append(tmpDir.getAbsolutePath()).append("/out.").append(id);
-                execute(sb.toString(), "Running rvtests: " + sb);
-
-                sb.setLength(0);
-                File file = new File(tmpDir.getAbsolutePath() + "/out." + id + ".Skat.assoc");
-                if (file.exists()) {
-                    BufferedReader reader = FileUtils.newBufferedReader(file.toPath());
-                    String line = reader.readLine();
-                    while ((line = reader.readLine()) != null) {
-                        sb.append(line).append("\n");
-                    }
-                    reader.close();
-                }
-
-                return sb.toString();
-            }
-        });
-
-        System.out.println(results.collect());
-    }
-
-    private Process execute(String cmdline, String label) {
-        Process p = null;
-        try {
-            System.out.println(label);
-            System.out.println("Executing: " + cmdline);
-            p = Runtime.getRuntime().exec(cmdline);
-
-            System.out.println("\tSTDOUT:");
-            System.out.println(readInputStream(p.getInputStream()));
-            System.out.println("\tSTDERR:");
-            System.out.println(readInputStream(p.getErrorStream()));
-        } catch (Exception e) {
-            e.printStackTrace();
-        }
-        return p;
-    }
-
-    private void saveStream(InputStream inputStream, Path path) {
-        try {
-            PrintWriter writer = new PrintWriter(path.toFile());
-            writer.write(readInputStream(inputStream));
-            writer.close();
-        } catch (Exception e) {
-            e.printStackTrace();
-        }
-    }
-
-    private String readInputStream(InputStream inputStream) {
-        StringBuilder res = new StringBuilder();
-        try {
-            // read the output from the command
-            BufferedReader stdInput = new BufferedReader(new InputStreamReader(inputStream));
-            String s;
-            System.out.println("Here is the standard output of the command:\n");
-            while ((s = stdInput.readLine()) != null) {
-                res.append(s).append("\n");
-            }
-        } catch (IOException e) {
-            res.append(e.getMessage()).append("\n");
-        }
-        return res.toString();
-    }
-
-    @Override
-    public void execute() throws AnalysisExecutorException {
-    }
-}
diff --git a/hpg-bigdata-analysis/src/main/java/org/opencb/hpg/bigdata/analysis/variant/VariantAnalysisExecutor.java b/hpg-bigdata-analysis/src/main/java/org/opencb/hpg/bigdata/analysis/variant/VariantAnalysisExecutor.java
index 6e3fa4e0..ad9f8891 100644
--- a/hpg-bigdata-analysis/src/main/java/org/opencb/hpg/bigdata/analysis/variant/VariantAnalysisExecutor.java
+++ b/hpg-bigdata-analysis/src/main/java/org/opencb/hpg/bigdata/analysis/variant/VariantAnalysisExecutor.java
@@ -7,13 +7,7 @@
  */
 public abstract class VariantAnalysisExecutor extends AnalysisExecutor {
 
-    protected String studyName;
-
-    public String getStudyName() {
-        return studyName;
-    }
-
-    public void setStudyName(String studyName) {
-        this.studyName = studyName;
+    protected VariantAnalysisExecutor(String studyId) {
+        super(studyId);
     }
 }
diff --git a/hpg-bigdata-analysis/src/main/java/org/opencb/hpg/bigdata/analysis/variant/VariantAnalysisUtils.java b/hpg-bigdata-analysis/src/main/java/org/opencb/hpg/bigdata/analysis/variant/VariantAnalysisUtils.java
new file mode 100644
index 00000000..e110b568
--- /dev/null
+++ b/hpg-bigdata-analysis/src/main/java/org/opencb/hpg/bigdata/analysis/variant/VariantAnalysisUtils.java
@@ -0,0 +1,287 @@
+package org.opencb.hpg.bigdata.analysis.variant;
+
+import htsjdk.variant.variantcontext.writer.Options;
+import org.apache.commons.lang3.StringUtils;
+import org.apache.spark.SparkConf;
+import org.apache.spark.SparkContext;
+import org.apache.spark.sql.SparkSession;
+import org.opencb.biodata.formats.pedigree.PedigreeManager;
+import org.opencb.biodata.models.core.Region;
+import org.opencb.biodata.models.core.pedigree.Pedigree;
+import org.opencb.biodata.models.metadata.Sample;
+import org.opencb.biodata.models.variant.metadata.VariantStudyMetadata;
+import org.opencb.biodata.tools.variant.converters.VCFExporter;
+import org.opencb.biodata.tools.variant.metadata.VariantMetadataManager;
+import org.opencb.commons.datastore.core.Query;
+import org.opencb.hpg.bigdata.core.lib.SparkConfCreator;
+import org.opencb.hpg.bigdata.core.lib.VariantDataset;
+
+import java.io.File;
+import java.io.IOException;
+import java.nio.file.Files;
+import java.nio.file.Paths;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.List;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import static java.nio.file.Paths.get;
+
+public class VariantAnalysisUtils {
+    public static final Pattern OPERATION_PATTERN = Pattern.compile("([^=<>~!]+)(.*)$");
+
+    public static void exportVCF(String inputAvroFilename, String metaFilename,
+                                 FilterParameters filterParams, String vcfFilename) {
+        // Generate VCF file by calling VCF exporter from query and query options
+        VariantMetadataManager manager = new VariantMetadataManager();
+        try {
+            manager.load(Paths.get(metaFilename));
+
+            SparkConf sparkConf = SparkConfCreator.getConf("tool plink", "local", 1, true);
+            SparkSession sparkSession = new SparkSession(new SparkContext(sparkConf));
+
+            VariantDataset vd = new VariantDataset(sparkSession);
+            vd.load(inputAvroFilename);
+            vd.createOrReplaceTempView("vcf");
+
+            // Add filters to variant dataset
+            if (filterParams != null) {
+                addVariantFilters(filterParams, vd);
+            }
+
+            // out filename
+            VariantStudyMetadata studyMetadata = manager.getVariantMetadata().getStudies().get(0);
+            VCFExporter vcfExporter = new VCFExporter(studyMetadata);
+            vcfExporter.open(Options.ALLOW_MISSING_FIELDS_IN_HEADER, Paths.get(vcfFilename));
+
+            vcfExporter.export(vd.iterator());
+
+            // close everything
+            vcfExporter.close();
+            sparkSession.stop();
+        } catch (Exception e) {
+            System.out.println("Error retrieving variants from Avro to VCF file: " + e.getMessage());
+        }
+    }
+
+    public static void addVariantFilters(FilterParameters filterParams, VariantDataset vd) throws IOException {
+        VariantMetadataManager metadataManager = new VariantMetadataManager();
+        //metadataManager.load(Paths.get(variantCommandOptions.queryVariantCommandOptions.input + ".meta.json"));
+
+        // query for ID (list and file)
+        List<String> list = null;
+        if (StringUtils.isNotEmpty(filterParams.ids)) {
+            // wrap in an ArrayList: Arrays.asList is fixed-size and would break addAll below
+            list = new ArrayList<>(Arrays.asList(StringUtils.split(filterParams.ids, ",")));
+        }
+        String idFilename = filterParams.idFilename;
+        if (StringUtils.isNotEmpty(idFilename) && new File(idFilename).exists()) {
+            if (list == null) {
+                list = Files.readAllLines(get(idFilename));
+            } else {
+                list.addAll(Files.readAllLines(get(idFilename)));
+            }
+        }
+        if (list != null) {
+            vd.idFilter(list, false);
+        }
+
+        // query for type
+        if (StringUtils.isNotEmpty(filterParams.types)) {
+            vd.typeFilter(Arrays.asList(
+                    StringUtils.split(filterParams.types, ",")));
+        }
+
+        // query for biotype
+        if (StringUtils.isNotEmpty(filterParams.biotypes)) {
+            vd.annotationFilter("biotype", Arrays.asList(
+                    StringUtils.split(filterParams.biotypes, ",")));
+        }
+
+        // query for study
+        if (StringUtils.isNotEmpty(filterParams.studies)) {
+            vd.studyFilter("studyId", Arrays.asList(
+                    StringUtils.split(filterParams.studies, ",")));
+        }
+
+        // query for maf (study:cohort)
+        if (StringUtils.isNotEmpty(filterParams.maf)) {
+            vd.studyFilter("stats.maf", filterParams.maf);
+        }
+
+        // query for mgf (study:cohort)
+        if (StringUtils.isNotEmpty(filterParams.mgf)) {
+            vd.studyFilter("stats.mgf", filterParams.mgf);
+        }
+
+        // query for number of missing alleles (study:cohort)
+        if (StringUtils.isNotEmpty(filterParams.missingAlleles)) {
+            vd.studyFilter("stats.missingAlleles", filterParams.missingAlleles);
+        }
+
+        // query for number of missing genotypes (study:cohort)
+        if (StringUtils.isNotEmpty(filterParams.missingGenotypes)) {
+            vd.studyFilter("stats.missingGenotypes", filterParams.missingGenotypes);
+        }
+
+        // query for region (list and file)
+        List<Region> regions = getRegionList(filterParams.regions, filterParams.regionFilename);
+        if (regions != null && regions.size() > 0) {
+            vd.regionFilter(regions);
+        }
+
+        // query for sample genotypes and/or sample filters
+        StringBuilder sampleGenotypes = new StringBuilder();
+        if (StringUtils.isNotEmpty(filterParams.sampleGenotypes)) {
+            sampleGenotypes.append(filterParams.sampleGenotypes);
+        }
+        String sampleFilters = filterParams.sampleFilters;
+        if (StringUtils.isNotEmpty(sampleGenotypes) || StringUtils.isNotEmpty(sampleFilters)) {
+            // TODO: we need the ID for dataset target
+            List<Sample> samples = null;
+
+            if (StringUtils.isNotEmpty(sampleFilters)) {
+                Query sampleQuery = new Query();
+//                final Pattern OPERATION_PATTERN = Pattern.compile("([^=<>~!]+.*)(<=?|>=?|!=|!?=?~|==?)([^=<>~!]+.*)$");
+                String[] splits = sampleFilters.split("[;]");
+                for (int i = 0; i < splits.length; i++) {
+                    Matcher matcher = OPERATION_PATTERN.matcher(splits[i]);
+                    if (matcher.matches()) {
+                        sampleQuery.put(matcher.group(1), matcher.group(2));
+                    }
+                }
+
+                samples = metadataManager.getSamples(sampleQuery,
+                        metadataManager.getVariantMetadata().getStudies().get(0).getId());
+
+                for (Sample sample : samples) {
+                    if (sampleGenotypes.length() > 0) {
+                        sampleGenotypes.append(";");
+                    }
+                    //sampleGenotypes.append(sample.getId()).append(":0|1,1|0,1|1");
+                    sampleGenotypes.append(sample.getId()).append(":1|1");
+                }
+            }
+            samples = metadataManager.getSamples(
+                    metadataManager.getVariantMetadata().getStudies().get(0).getId());
+
+            // e.g.: sample genotypes = sample1:0|0;sample2:1|0,1|1
+            String[] values = sampleGenotypes.toString().split("[;]");
+            StringBuilder newSampleGenotypes = new StringBuilder();
+            if (values == null) {
+                newSampleGenotypes.append(updateSampleGenotype(sampleGenotypes.toString(), samples));
+            } else {
+                newSampleGenotypes.append(updateSampleGenotype(values[0], samples));
+                for (int i = 1; i < values.length; i++) {
+                    newSampleGenotypes.append(";");
+                    newSampleGenotypes.append(updateSampleGenotype(values[i], samples));
+                }
+            }
+            if (!StringUtils.isEmpty(newSampleGenotypes)) {
+                vd.sampleFilter("GT", newSampleGenotypes.toString());
+            } else {
+                System.err.format("Error: could not parse your sample genotypes %s.\n", sampleGenotypes);
+            }
+        }
+
+        // query for consequence type (Sequence Ontology term names and accession codes)
+        annotationFilterNotEmpty("consequenceTypes.sequenceOntologyTerms", filterParams.consequenceTypes, vd);
+
+        // query for consequence type (gene names)
+        annotationFilterNotEmpty("consequenceTypes.geneName", filterParams.genes, vd);
+
+        // query for clinvar (accession)
+        annotationFilterNotEmpty("variantTraitAssociation.clinvar.accession", filterParams.clinvar, vd);
+
+        // query for cosmic (mutation ID)
+        annotationFilterNotEmpty("variantTraitAssociation.cosmic.mutationId", filterParams.cosmic, vd);
+
+        // query for conservation (phastCons, phylop, gerp)
+        annotationFilterNotEmpty("conservation", filterParams.conservScores, vd);
+
+        // query for protein substitution scores (polyphen, sift)
+        annotationFilterNotEmpty("consequenceTypes.proteinVariantAnnotation.substitutionScores", filterParams.substScores, vd);
+
+        // query for alternate population frequency (study:population)
+        annotationFilterNotEmpty("populationFrequencies.altAlleleFreq", filterParams.pf, vd);
+
+        // query for population minor allele frequency (study:population)
+        annotationFilterNotEmpty("populationFrequencies.refAlleleFreq", filterParams.pmaf, vd);
+    }
+
+    private static void annotationFilterNotEmpty(String key, String value, VariantDataset vd) {
+        if (StringUtils.isNotEmpty(value)) {
+            vd.annotationFilter(key, value);
+        }
+    }
+
+    public static List<Region> getRegionList(String regions, String regionFilename) throws IOException {
+        List<Region> list = null;
+        if (StringUtils.isNotEmpty(regions)) {
+            list = Region.parseRegions(regions);
+        }
+        if (StringUtils.isNotEmpty(regionFilename) && new File(regionFilename).exists()) {
+            if (list == null) {
+                list = new ArrayList<>();
+            }
+            List<String> lines = Files.readAllLines(Paths.get(regionFilename));
+            for (String line : lines) {
+                list.add(new Region(line));
+            }
+        }
+        return list;
+    }
+
+    /**
+     * Update the sample genotype query string by replacing the sample name by
+     * its sample order, e.g.: from sample2:1|0,1|1 to 32:1|0,1|1.
+     *
+     * @param sampleGenotype Sample genotype query string
+     * @param samples        Sample list in the right order (to get the sample index)
+     * @return Updated sample genotype query string
+     */
+    private static String updateSampleGenotype(String sampleGenotype, List<Sample> samples) {
+        // e.g.: value = sample2:1|0,1|1
+        StringBuilder newSampleGenotype = new StringBuilder("");
+        String[] splits = sampleGenotype.split("[:]");
+        if (splits.length != 2) {
+            // error
+            System.err.format("Error: invalid expression %s for sample genotypes.\n", sampleGenotype);
+        } else {
+            boolean found = false;
+            // TODO: move this functionality to the VariantMetadataManager (from sample name to sample index)
+            for (int i = 0; i < samples.size(); i++) {
+                if (splits[0].equals(samples.get(i).getId())) {
+                    newSampleGenotype.append(i).append(":").append(splits[1]);
+                    found = true;
+                    break;
+                }
+            }
+            // Sanity check
+            if (!found) {
+                // error
+                System.err.format("Error: sample %s not found in dataset.\n", splits[0]);
+            }
+        }
+        System.out.println(sampleGenotype + " -> " + newSampleGenotype.toString());
+        return newSampleGenotype.toString();
+    }
+
+    /**
+     * Export variant metadata into pedigree file format.
+     *
+     * @param metaFilename Variant metadata file name
+     * @param studyId      Study ID target
+     * @param pedFilename  Pedigree file name
+     * @throws IOException IO exception
+     */
+    public static void exportPedigree(String metaFilename, String studyId, String pedFilename) throws IOException {
+        VariantMetadataManager metadataManager = new VariantMetadataManager();
+        metadataManager.load(Paths.get(metaFilename));
+        Pedigree pedigree = metadataManager.getPedigree(studyId);
+        PedigreeManager pedigreeManager = new PedigreeManager();
+        pedigreeManager.save(pedigree, Paths.get(pedFilename));
+    }
+}
diff --git a/hpg-bigdata-analysis/src/main/java/org/opencb/hpg/bigdata/analysis/tools/adaptors/PlinkAdaptor.java b/hpg-bigdata-analysis/src/main/java/org/opencb/hpg/bigdata/analysis/variant/wrappers/PlinkWrapper.java
similarity index 89%
rename from hpg-bigdata-analysis/src/main/java/org/opencb/hpg/bigdata/analysis/tools/adaptors/PlinkAdaptor.java
rename to hpg-bigdata-analysis/src/main/java/org/opencb/hpg/bigdata/analysis/variant/wrappers/PlinkWrapper.java
index 6579b403..6457b0d4 100644
--- a/hpg-bigdata-analysis/src/main/java/org/opencb/hpg/bigdata/analysis/tools/adaptors/PlinkAdaptor.java
+++ b/hpg-bigdata-analysis/src/main/java/org/opencb/hpg/bigdata/analysis/variant/wrappers/PlinkWrapper.java
@@ -1,4 +1,4 @@
-package org.opencb.hpg.bigdata.analysis.tools.adaptors;
+package org.opencb.hpg.bigdata.analysis.variant.wrappers;
 
 import com.fasterxml.jackson.databind.ObjectMapper;
 import com.fasterxml.jackson.databind.ObjectReader;
@@ -14,6 +14,7 @@
 import org.opencb.hpg.bigdata.analysis.tools.Executor;
 import org.opencb.hpg.bigdata.analysis.tools.Status;
 import org.opencb.hpg.bigdata.analysis.tools.ToolManager;
+import org.opencb.hpg.bigdata.analysis.variant.VariantAnalysisExecutor;
 import org.opencb.hpg.bigdata.core.lib.SparkConfCreator;
 import org.opencb.hpg.bigdata.core.lib.VariantDataset;
 import org.slf4j.Logger;
@@ -24,19 +25,20 @@
 import java.util.HashMap;
 import java.util.Map;
 
-public class PlinkAdaptor {
+public class PlinkWrapper extends VariantAnalysisWrapper {
 
     private Query query;
     private QueryOptions queryOptions;
 
     private Logger logger;
 
-    public PlinkAdaptor() {
-        this.logger = LoggerFactory.getLogger(PlinkAdaptor.class);
+    public PlinkWrapper(String studyId) {
+        super(studyId);
+        this.logger = LoggerFactory.getLogger(PlinkWrapper.class);
     }
 
-    public void run() {
+    public void execute() {
         String inputAvroFilename = query.getString("input");
         String tmpVcfFilename = query.get("outdir") + "/tmp.vcf";
@@ -83,7 +85,7 @@ public void run() {
         //params.put("output", "/tmp/test.bam.bai");
         String commandLine = toolManager.createCommandLine("plink", "index", params);
-        System.out.println(commandLine);
 
         Path tmp = Paths.get("/tmp");
+        System.out.println(commandLine);
 
         Executor.execute(commandLine, tmp, true);
diff --git a/hpg-bigdata-analysis/src/main/java/org/opencb/hpg/bigdata/analysis/variant/wrappers/RvTestsWrapper.java b/hpg-bigdata-analysis/src/main/java/org/opencb/hpg/bigdata/analysis/variant/wrappers/RvTestsWrapper.java
new file mode 100644
index 00000000..f61c728f
--- /dev/null
+++ b/hpg-bigdata-analysis/src/main/java/org/opencb/hpg/bigdata/analysis/variant/wrappers/RvTestsWrapper.java
@@ -0,0 +1,95 @@
+package org.opencb.hpg.bigdata.analysis.variant.wrappers;
+
+import org.opencb.hpg.bigdata.analysis.AnalysisExecutorException;
+import org.opencb.hpg.bigdata.analysis.exceptions.AnalysisToolException;
+import org.opencb.hpg.bigdata.analysis.tools.Executor;
+import org.opencb.hpg.bigdata.analysis.variant.FilterParameters;
+import org.opencb.hpg.bigdata.analysis.variant.VariantAnalysisExecutor;
+import org.opencb.hpg.bigdata.analysis.variant.VariantAnalysisUtils;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.IOException;
+import java.nio.file.Path;
+import java.nio.file.Paths;
+import java.util.Map;
+
+/**
+ * Created by joaquin on 1/19/17.
+ */
+//public class RvTestsWrapper extends AnalysisExecutor implements Serializable {
+public class RvTestsWrapper extends VariantAnalysisWrapper {
+    private String inFilename;
+    private String metaFilename;
+    private FilterParameters filterParams;
+    private Map<String, String> rvtestsParams;
+
+    private Logger logger;
+
+    public RvTestsWrapper(String studyId, String inFilename, String metaFilename,
+                          FilterParameters filterParams, Map<String, String> rvtestsParams) {
+        super(studyId);
+        this.inFilename = inFilename;
+        this.metaFilename = metaFilename;
+        this.filterParams = filterParams;
+        this.rvtestsParams = rvtestsParams;
+
+        this.logger = LoggerFactory.getLogger(RvTestsWrapper.class);
+    }
+
+    @Override
+    public void execute() throws AnalysisExecutorException {
+        // Sanity check
+        if (binPath == null || !binPath.toFile().exists()) {
+            String msg = "RvTests binary path is missing.";
+            logger.error(msg);
+            throw new AnalysisExecutorException(msg);
+        }
+
+        // Get output dir
+        Path outDir = Paths.get("/tmp");
+        if (rvtestsParams.get("out") != null) {
+            outDir = Paths.get(rvtestsParams.get("out")).getParent();
+        }
+
+        // Export target variants to VCF file
+        String vcfFilename = outDir.toString() + "/tmp.vcf";
+        VariantAnalysisUtils.exportVCF(inFilename, metaFilename, filterParams, vcfFilename);
+
+        // Export pedigree
+        String pedFilename = outDir.toString() + "/tmp.vcf.ped";
+        try {
+            VariantAnalysisUtils.exportPedigree(metaFilename, studyId, pedFilename);
+        } catch (IOException e) {
+            logger.error(e.getMessage());
+            throw new AnalysisExecutorException(e);
+        }
+
+        StringBuilder sb = new StringBuilder();
+/*
+        // Compress vcf to bgz
+        sb.setLength(0);
+        sb.append(BGZIP_BIN).append(" ").append(vcfFilename);
+        Executor.execute(sb.toString(), outDir, true);
+
+        // ...create tabix index
+        sb.setLength(0);
+        sb.append(TABIX_BIN).append(" -p vcf ").append(vcfFilename).append(".gz");
+        Executor.execute(sb.toString(), outDir, true);
+*/
+        // ...and finally, run rvtests
+        sb.setLength(0);
+        sb.append(binPath);
+        sb.append(" --inVcf ").append(vcfFilename).append(".gz");
+        sb.append(" --pheno ").append(pedFilename);
+        for (String key: rvtestsParams.keySet()) {
+            sb.append(" --").append(key).append(" ").append(rvtestsParams.get(key));
+        }
+        try {
+            Executor.execute(sb.toString(), outDir, true);
+        } catch (AnalysisToolException e) {
+            logger.error(e.getMessage());
+            throw new AnalysisExecutorException(e);
+        }
+    }
+}
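Note: the wrapper is driven exactly as in VariantCommandExecutor.rvtests() further down this patch; roughly (study name, file names and binary path are illustrative):

    Map<String, String> rvtestsParams = new HashMap<>();
    rvtestsParams.put("single", "wald");
    rvtestsParams.put("out", "/tmp/out.wald");

    RvTestsWrapper rvtests = new RvTestsWrapper("test", "example.vcf.avro",
            "example.vcf.avro.meta.json", new FilterParameters(), rvtestsParams);
    rvtests.setBinPath(Paths.get("/opt/rvtests/executable/rvtest")); // illustrative path
    rvtests.execute();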
diff --git a/hpg-bigdata-analysis/src/main/java/org/opencb/hpg/bigdata/analysis/variant/wrappers/VariantAnalysisWrapper.java b/hpg-bigdata-analysis/src/main/java/org/opencb/hpg/bigdata/analysis/variant/wrappers/VariantAnalysisWrapper.java
new file mode 100644
index 00000000..c1d1228b
--- /dev/null
+++ b/hpg-bigdata-analysis/src/main/java/org/opencb/hpg/bigdata/analysis/variant/wrappers/VariantAnalysisWrapper.java
@@ -0,0 +1,22 @@
+package org.opencb.hpg.bigdata.analysis.variant.wrappers;
+
+import org.opencb.hpg.bigdata.analysis.variant.VariantAnalysisExecutor;
+
+import java.nio.file.Path;
+
+public abstract class VariantAnalysisWrapper extends VariantAnalysisExecutor {
+
+    protected Path binPath;
+
+    protected VariantAnalysisWrapper(String studyId) {
+        super(studyId);
+    }
+
+    public void setBinPath(Path binPath) {
+        this.binPath = binPath;
+    }
+
+    public Path getBinPath() {
+        return binPath;
+    }
+}
diff --git a/hpg-bigdata-app/src/main/java/org/opencb/hpg/bigdata/app/cli/local/CliUtils.java b/hpg-bigdata-app/src/main/java/org/opencb/hpg/bigdata/app/cli/local/CliUtils.java
index 37d2c702..75e06a02 100644
--- a/hpg-bigdata-app/src/main/java/org/opencb/hpg/bigdata/app/cli/local/CliUtils.java
+++ b/hpg-bigdata-app/src/main/java/org/opencb/hpg/bigdata/app/cli/local/CliUtils.java
@@ -43,6 +43,7 @@ public static String getOutputFilename(String input, String output, String to) t
         return res;
     }
 
+    @Deprecated
     public static List<Region> getRegionList(String regions, String regionFilename) throws IOException {
         List<Region> list = null;
         if (StringUtils.isNotEmpty(regions)) {
@@ -97,6 +98,7 @@ public static void saveDatasetAsOneFile(ParentDataset ds, String format, String
         dir.delete();
     }
 
+    @Deprecated
    public static void addVariantFilters(VariantCommandOptions variantCommandOptions,
                                         VariantDataset vd) throws IOException {
 
@@ -253,6 +255,7 @@ public static void addVariantFilters(VariantCommandOptions variantCommandOptions
                 variantCommandOptions.queryVariantCommandOptions.pmaf, vd);
     }
 
+    @Deprecated
     private static void annotationFilterNotEmpty(String key, String value, VariantDataset vd) {
         if (StringUtils.isNotEmpty(value)) {
             vd.annotationFilter(key, value);
@@ -282,6 +285,7 @@ public static VariantCommandOptions createVariantCommandOptions(
      * @param samples Sample list in the right order (to get the sample index)
      * @return Updated sample genotype query string
      */
+    @Deprecated
     private static String updateSampleGenotype(String sampleGenotype, List<Sample> samples) {
         // e.g.: value = sample2:1|0,1|1
         StringBuilder newSampleGenotype = new StringBuilder("");
diff --git a/hpg-bigdata-app/src/main/java/org/opencb/hpg/bigdata/app/cli/local/LocalCliOptionsParser.java b/hpg-bigdata-app/src/main/java/org/opencb/hpg/bigdata/app/cli/local/LocalCliOptionsParser.java
index fe207e79..c31221c5 100644
--- a/hpg-bigdata-app/src/main/java/org/opencb/hpg/bigdata/app/cli/local/LocalCliOptionsParser.java
+++ b/hpg-bigdata-app/src/main/java/org/opencb/hpg/bigdata/app/cli/local/LocalCliOptionsParser.java
@@ -20,6 +20,7 @@
 import com.beust.jcommander.Parameter;
 import com.beust.jcommander.ParameterDescription;
 import com.beust.jcommander.ParameterException;
+import org.opencb.hpg.bigdata.analysis.variant.FilterParameters;
 import org.opencb.hpg.bigdata.app.cli.local.options.*;
 
 import java.util.Map;
@@ -35,6 +36,7 @@ public class LocalCliOptionsParser {
 
     private final CommandOptions commandOptions;
     private final CommonCommandOptions commonCommandOptions;
+    private final FilterParameters filterOptions;
 
     private AdminCommandOptions adminCommandOptions;
 
@@ -55,6 +57,7 @@ public LocalCliOptionsParser() {
 
         commandOptions = new CommandOptions();
         commonCommandOptions = new CommonCommandOptions();
+        filterOptions = new FilterParameters();
 
         adminCommandOptions = new AdminCommandOptions(commonCommandOptions, jcommander);
         jcommander.addCommand("admin", adminCommandOptions);
@@ -77,7 +80,7 @@ public LocalCliOptionsParser() {
         alignmentSubCommands.addCommand("coverage", alignmentCommandOptions.coverageAlignmentCommandOptions);
         alignmentSubCommands.addCommand("query", alignmentCommandOptions.queryAlignmentCommandOptions);
 
-        variantCommandOptions = new VariantCommandOptions(commonCommandOptions, jcommander);
+        variantCommandOptions = new VariantCommandOptions(commonCommandOptions, filterOptions, jcommander);
         jcommander.addCommand("variant", variantCommandOptions);
         JCommander variantSubCommands = jcommander.getCommands().get("variant");
         variantSubCommands.addCommand("convert", variantCommandOptions.convertVariantCommandOptions);
diff --git a/hpg-bigdata-app/src/main/java/org/opencb/hpg/bigdata/app/cli/local/executors/VariantCommandExecutor.java b/hpg-bigdata-app/src/main/java/org/opencb/hpg/bigdata/app/cli/local/executors/VariantCommandExecutor.java
index 3abe2138..940e3753 100644
--- a/hpg-bigdata-app/src/main/java/org/opencb/hpg/bigdata/app/cli/local/executors/VariantCommandExecutor.java
+++ b/hpg-bigdata-app/src/main/java/org/opencb/hpg/bigdata/app/cli/local/executors/VariantCommandExecutor.java
@@ -31,10 +31,10 @@
 import org.opencb.biodata.models.metadata.SampleSetType;
 import org.opencb.biodata.models.variant.avro.StudyEntry;
 import org.opencb.biodata.models.variant.avro.VariantAvro;
-import org.opencb.biodata.models.variant.avro.VariantFileMetadata;
+import org.opencb.biodata.models.variant.metadata.VariantFileMetadata;
 import org.opencb.biodata.tools.variant.metadata.VariantMetadataManager;
 import org.opencb.commons.utils.FileUtils;
-import org.opencb.hpg.bigdata.analysis.variant.RvTestsAdaptor;
+import org.opencb.hpg.bigdata.analysis.variant.wrappers.RvTestsWrapper;
 import org.opencb.hpg.bigdata.app.cli.CommandExecutor;
 import org.opencb.hpg.bigdata.app.cli.local.CliUtils;
 import org.opencb.hpg.bigdata.app.cli.local.options.VariantCommandOptions;
@@ -53,7 +53,7 @@
 
 import static java.nio.file.Paths.get;
 
-//import org.opencb.hpg.bigdata.analysis.variant.analysis.RvTestsAdaptor;
+//import org.opencb.hpg.bigdata.analysis.variant.analysis.RvTestsWrapper;
 
 /**
  * Created by imedina on 25/06/15.
@@ -193,7 +193,7 @@ private void convert() throws IOException {
         ObjectMapper mapper = new ObjectMapper();
         //mapper.setSerializationInclusion(JsonInclude.Include.NON_NULL);
         VariantFileMetadata data = mapper.readValue(metaFile, VariantFileMetadata.class);
-        data.setFileName(outMetaFile.toString());
+        data.setId(outMetaFile.toString());
 
         // write the metadata
         PrintWriter writer = new PrintWriter(new FileOutputStream(outMetaFile));
@@ -371,6 +371,9 @@ public void metadata() throws Exception {
         if (variantCommandOptions.metadataVariantCommandOptions.loadPedFilename != null) {
             Pedigree pedigree = new PedigreeManager().parse(
                     get(variantCommandOptions.metadataVariantCommandOptions.loadPedFilename));
+
+            System.out.println(pedigree.toString());
+
             metadataManager.loadPedigree(pedigree, datasetId);
             updated = true;
         }
@@ -418,12 +421,14 @@ public void rvtests() throws Exception {
 
-        RvTestsAdaptor rvtests = new RvTestsAdaptor(variantCommandOptions.rvtestsVariantCommandOptions.inFilename,
-                variantCommandOptions.rvtestsVariantCommandOptions.metaFilename,
-                variantCommandOptions.rvtestsVariantCommandOptions.outDirname,
-                variantCommandOptions.rvtestsVariantCommandOptions.confFilename);
-
-//        rvtests.run(variantCommandOptions.rvtestsVariantCommandOptions.datasetId);
-        rvtests.run00(variantCommandOptions.rvtestsVariantCommandOptions.datasetId);
+        RvTestsWrapper rvtests = new RvTestsWrapper(variantCommandOptions.rvtestsVariantCommandOptions.datasetId,
+                variantCommandOptions.rvtestsVariantCommandOptions.inFilename,
+                variantCommandOptions.rvtestsVariantCommandOptions.inFilename + ".meta.json",
+                variantCommandOptions.rvtestsVariantCommandOptions.filterParameters,
+                variantCommandOptions.rvtestsVariantCommandOptions.rvtestsParams);
+
+        // TODO: get the binary path from config folder
+        rvtests.setBinPath(Paths.get("/home/jtarraga/soft/rvtests/executable/rvtest"));
+        rvtests.execute();
     }
 }
diff --git a/hpg-bigdata-app/src/main/java/org/opencb/hpg/bigdata/app/cli/local/options/VariantCommandOptions.java b/hpg-bigdata-app/src/main/java/org/opencb/hpg/bigdata/app/cli/local/options/VariantCommandOptions.java
index 67a5b160..1b299923 100644
--- a/hpg-bigdata-app/src/main/java/org/opencb/hpg/bigdata/app/cli/local/options/VariantCommandOptions.java
+++ b/hpg-bigdata-app/src/main/java/org/opencb/hpg/bigdata/app/cli/local/options/VariantCommandOptions.java
@@ -1,12 +1,13 @@
 package org.opencb.hpg.bigdata.app.cli.local.options;
 
-import com.beust.jcommander.JCommander;
-import com.beust.jcommander.Parameter;
-import com.beust.jcommander.Parameters;
-import com.beust.jcommander.ParametersDelegate;
+import com.beust.jcommander.*;
 import org.apache.parquet.hadoop.ParquetWriter;
+import org.opencb.hpg.bigdata.analysis.variant.FilterParameters;
 import org.opencb.hpg.bigdata.app.cli.local.LocalCliOptionsParser;
 
+import java.util.HashMap;
+import java.util.Map;
+
 /**
  * Created by jtarraga on 01/06/17.
  */
@@ -23,6 +24,7 @@ public class VariantCommandOptions {
     public RvTestsVariantCommandOptions rvtestsVariantCommandOptions;
 
     public LocalCliOptionsParser.CommonCommandOptions commonCommandOptions;
+    public FilterParameters commonFilterOptions;
     public JCommander jCommander;
 
     public VariantCommandOptions(LocalCliOptionsParser.CommonCommandOptions commonCommandOptions,
@@ -38,6 +40,21 @@ public VariantCommandOptions(LocalCliOptionsParser.CommonCommandOptions commonCo
         this.rvtestsVariantCommandOptions = new RvTestsVariantCommandOptions();
     }
 
+    public VariantCommandOptions(LocalCliOptionsParser.CommonCommandOptions commonCommandOptions,
+                                 FilterParameters commonFilterOptions,
+                                 JCommander jCommander) {
+        this.commonCommandOptions = commonCommandOptions;
+        this.commonFilterOptions = commonFilterOptions;
+        this.jCommander = jCommander;
+
+        this.convertVariantCommandOptions = new ConvertVariantCommandOptions();
+        this.annotateVariantCommandOptions = new AnnotateVariantCommandOptions();
+        this.viewVariantCommandOptions = new ViewVariantCommandOptions();
+        this.queryVariantCommandOptions = new QueryVariantCommandOptions();
+        this.metadataVariantCommandOptions = new MetadataVariantCommandOptions();
+        this.rvtestsVariantCommandOptions = new RvTestsVariantCommandOptions();
+    }
+
     @Parameters(commandNames = {"convert"}, commandDescription = "Convert gVCF/VCF files to different big data"
             + " formats such as Avro and Parquet using OpenCB models")
     public class ConvertVariantCommandOptions {
@@ -331,10 +348,16 @@ public class RvTestsVariantCommandOptions {
         @ParametersDelegate
         public LocalCliOptionsParser.CommonCommandOptions commonOptions = commonCommandOptions;
 
+        @ParametersDelegate
+        public FilterParameters filterParameters = commonFilterOptions;
+
+        @Parameter(names = {"--dataset"}, description = "Target dataset.", arity = 1)
+        public String datasetId = null;
+
         @Parameter(names = {"-i", "--input"}, description = "Input file name (in Avro/Parquet file format).",
                 required = true, arity = 1)
         public String inFilename;
-
+/*
         @Parameter(names = {"-m", "--metadata"}, description = "Input metadata file name.",
                 required = true, arity = 1)
         public String metaFilename;
@@ -343,12 +366,8 @@ public class RvTestsVariantCommandOptions {
                 required = true, arity = 1)
         public String outDirname;
 
-        @Parameter(names = {"--dataset"}, description = "Target dataset.",
-                arity = 1)
-        public String datasetId = null;
-
-        @Parameter(names = {"-c", "--config"}, description = "Configuration file name containing the rvtests parameters.",
-                required = true, arity = 1)
-        public String confFilename;
+*/
+        @DynamicParameter(names = "-D", description = "RvTests parameters")
+        public Map<String, String> rvtestsParams = new HashMap<>();
     }
 }
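Note: the -D dynamic parameters introduced above replace the old fixed rvtests options; JCommander splits each -Dkey=value argument into a map entry, which RvTestsWrapper then renders back as "--key value". A small hypothetical demo (class and values are illustrative only):

    import com.beust.jcommander.DynamicParameter;
    import com.beust.jcommander.JCommander;
    import java.util.HashMap;
    import java.util.Map;

    // Hypothetical demo class, not part of this patch.
    class DynamicParamsDemo {
        @DynamicParameter(names = "-D", description = "RvTests parameters")
        Map<String, String> rvtestsParams = new HashMap<>();

        public static void main(String[] args) {
            DynamicParamsDemo demo = new DynamicParamsDemo();
            new JCommander(demo).parse("-Dsingle=wald", "-Dout=/tmp/out.wald");
            System.out.println(demo.rvtestsParams); // {single=wald, out=/tmp/out.wald}
        }
    }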
diff --git a/hpg-bigdata-app/src/test/java/org/opencb/hpg/bigdata/app/cli/local/VariantRvTestsCLITest.java b/hpg-bigdata-app/src/test/java/org/opencb/hpg/bigdata/app/cli/local/VariantRvTestsCLITest.java
index 2a3e64b0..e25cd198 100644
--- a/hpg-bigdata-app/src/test/java/org/opencb/hpg/bigdata/app/cli/local/VariantRvTestsCLITest.java
+++ b/hpg-bigdata-app/src/test/java/org/opencb/hpg/bigdata/app/cli/local/VariantRvTestsCLITest.java
@@ -2,7 +2,6 @@
 
 import org.junit.Test;
 
-import java.net.URISyntaxException;
 import java.nio.file.Path;
 import java.nio.file.Paths;
 
@@ -10,36 +9,60 @@
  * Created by joaquin on 1/19/17.
  */
 public class VariantRvTestsCLITest {
-    Path inPath;
-    Path outPath;
-    Path confPath;
-
-    private void init() throws URISyntaxException {
-        String root = "/home/jtarraga/data100/vcf/rvtest-skat/spark";
-//        String root = "/home/jtarraga/data/vcf/skat/";
-        inPath = Paths.get(root + "/example.vcf.avro");
-        outPath = Paths.get(root + "/out");
-        confPath = Paths.get(root + "/skat.params");
+    String datasetName = "test";
+
+    String vcfFilename = "../hpg-bigdata-app/src/test/resources/example.vcf";
+    String phenoFilename = "../hpg-bigdata-app/src/test/resources/pheno";
+    String outDir = "/tmp/";
+
+    Path vcfPath;
+    Path phenoPath;
+    Path avroPath;
+    Path metaPath;
+
+    private void init() throws Exception {
+        vcfPath = Paths.get(vcfFilename);
+        phenoPath = Paths.get(phenoFilename);
+        avroPath = Paths.get(outDir + "/" + vcfPath.getFileName() + ".avro");
+        metaPath = Paths.get(outDir + "/" + vcfPath.getFileName() + ".avro.meta.json");
+
+        avroPath.toFile().delete();
+        metaPath.toFile().delete();
+
+        // convert vcf to avro
+        StringBuilder commandLine = new StringBuilder();
+        commandLine.append(" variant convert");
+        commandLine.append(" --log-level ERROR");
+        commandLine.append(" -i ").append(vcfPath);
+        commandLine.append(" -o ").append(avroPath);
+        commandLine.append(" --dataset ").append(datasetName);
+        VariantQueryCLITest.execute(commandLine.toString());
+
+        // load pedigree file
+        commandLine.setLength(0);
+        commandLine.append(" variant metadata");
+        commandLine.append(" --log-level ERROR");
+        commandLine.append(" -i ").append(avroPath);
+        commandLine.append(" --load-pedigree ").append(phenoPath);
+        commandLine.append(" --dataset ").append(datasetName);
+        VariantQueryCLITest.execute(commandLine.toString());
     }
 
     @Test
     public void skat() {
-
         try {
             init();
 
             StringBuilder commandLine = new StringBuilder();
             commandLine.append(" variant rvtests");
             commandLine.append(" --log-level ERROR");
-            commandLine.append(" -i ").append(inPath);
-            commandLine.append(" -o ").append(outPath);
-            commandLine.append(" -c ").append(confPath);
-            commandLine.append(" --dataset noname");
-
+            commandLine.append(" --dataset ").append(datasetName);
+            commandLine.append(" -i ").append(avroPath);
+            commandLine.append(" -Dsingle=wald");
+            commandLine.append(" -Dout=/tmp/out.wald");
             VariantQueryCLITest.execute(commandLine.toString());
         } catch (Exception e) {
             e.printStackTrace();
         }
     }
-
 }
diff --git a/hpg-bigdata-app/src/test/resources/example.vcf b/hpg-bigdata-app/src/test/resources/example.vcf
new file mode 100755
index 00000000..a2358409
--- /dev/null
+++ b/hpg-bigdata-app/src/test/resources/example.vcf
@@ -0,0 +1,9 @@
+##fileformat=VCFv4.1
+##FILTER=
+##fileDate=20140730
+##reference=ftp://ftp.1000genomes.ebi.ac.uk//vol1/ftp/technical/reference/phase2_reference_assembly_sequence/hs37d5.fa.gz
+##source=1000GenomesPhase3Pipeline
+#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT P1 P2 P3 P4 P5 P6 P7 P8 P9
+1 1 . A G 100 . . GT 0/1 0/0 0/0 1/1 0/1 0/0 0/1 0/1 0/0
+1 2 . A G 100 . . GT 0/0 0/0 0/0 0/0 0/0 0/0 0/1 0/0 0/1
+1 3 . A G 100 . . GT 0/0 0/0 0/0 0/0 0/1 0/0 0/0 0/0 0/0
diff --git a/hpg-bigdata-app/src/test/resources/pheno b/hpg-bigdata-app/src/test/resources/pheno
new file mode 100755
index 00000000..608afd9b
--- /dev/null
+++ b/hpg-bigdata-app/src/test/resources/pheno
@@ -0,0 +1,10 @@
+#fid iid fatid matid sex phenotype y1 y2 y3 y4
+P1 P1 0 0 0 2 1.911 -1.465 -0.817 1
+P2 P2 0 0 2 2 2.146 -2.451 -0.178 2
+P3 P3 0 0 2 1 1.086 -1.194 -0.899 1
+P4 P4 0 0 2 1 0.704 -1.052 -0.237 1
+P5 P5 0 0 1 2 2.512 -3.085 -2.579 1
+P6 P6 0 0 2 2 1.283 -1.294 -0.342 1
+P7 P7 0 0 1 2 2.384 1.070 -1.522 2
+P8 P8 0 0 1 1 3.004 0.680 -1.875 1
+P9 P9 0 0 1 1 0.714 -0.116 -1.604 1