diff --git a/hpg-bigdata-analysis/pom.xml b/hpg-bigdata-analysis/pom.xml
index 768cb6d4..f43f97be 100644
--- a/hpg-bigdata-analysis/pom.xml
+++ b/hpg-bigdata-analysis/pom.xml
@@ -84,6 +84,11 @@
<groupId>junit</groupId>
<artifactId>junit</artifactId>
</dependency>
+ <dependency>
+ <groupId>com.beust</groupId>
+ <artifactId>jcommander</artifactId>
+ <version>1.58</version>
+ </dependency>
\ No newline at end of file
diff --git a/hpg-bigdata-analysis/src/main/java/org/opencb/hpg/bigdata/analysis/AnalysisExecutor.java b/hpg-bigdata-analysis/src/main/java/org/opencb/hpg/bigdata/analysis/AnalysisExecutor.java
index 9444df34..8ec3aafd 100644
--- a/hpg-bigdata-analysis/src/main/java/org/opencb/hpg/bigdata/analysis/AnalysisExecutor.java
+++ b/hpg-bigdata-analysis/src/main/java/org/opencb/hpg/bigdata/analysis/AnalysisExecutor.java
@@ -4,15 +4,19 @@
* Created by jtarraga on 30/01/17.
*/
public abstract class AnalysisExecutor {
- protected String datasetName;
+ protected String studyId;
- public String getDatasetName() {
- return datasetName;
+ protected AnalysisExecutor(String studyId) {
+ this.studyId = studyId;
}
- public void setDatasetName(String datasetName) {
- this.datasetName = datasetName;
+ protected String getStudyId() {
+ return studyId;
}
- public abstract void execute() throws AnalysisExecutorException;
+ protected void setStudyId(String studyId) {
+ this.studyId = studyId;
+ }
+
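+ /**
+ * Runs the analysis. A minimal sketch of a concrete executor (the class
+ * name is hypothetical, for illustration only):
+ * <pre>
+ * public class MyAnalysis extends AnalysisExecutor {
+ * public MyAnalysis(String studyId) {
+ * super(studyId);
+ * }
+ *
+ * protected void execute() throws AnalysisExecutorException {
+ * // run the analysis against the target study here
+ * }
+ * }
+ * </pre>
+ */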
+ protected abstract void execute() throws AnalysisExecutorException;
}
diff --git a/hpg-bigdata-analysis/src/main/java/org/opencb/hpg/bigdata/analysis/variant/FilterParameters.java b/hpg-bigdata-analysis/src/main/java/org/opencb/hpg/bigdata/analysis/variant/FilterParameters.java
new file mode 100644
index 00000000..bdbb7e66
--- /dev/null
+++ b/hpg-bigdata-analysis/src/main/java/org/opencb/hpg/bigdata/analysis/variant/FilterParameters.java
@@ -0,0 +1,105 @@
+package org.opencb.hpg.bigdata.analysis.variant;
+
+import com.beust.jcommander.Parameter;
+
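+/**
+ * Common variant filter options shared by the query and analysis commands.
+ * A minimal parsing sketch (plain JCommander usage; values are illustrative only):
+ * <pre>
+ * FilterParameters filters = new FilterParameters();
+ * new JCommander(filters).parse("--type", "SNV", "-r", "22:100000-200000");
+ * // filters.types -> "SNV", filters.regions -> "22:100000-200000"
+ * </pre>
+ */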
+public class FilterParameters {
+ @Parameter(names = {"--id"}, description = "Query for ID; comma separated list of IDs, e.g.:"
+ + " \"rs312411,rs421225\"", arity = 1)
+ public String ids;
+
+ @Parameter(names = {"--id-file"}, description = "Query for ID that are stored in a file, one ID per line,"
+ + " e.g.: rs312411", arity = 1)
+ public String idFilename;
+
+ @Parameter(names = {"--type"}, description = "Query for type; comma separated list of IDs, e.g.:"
+ + " \"INDEL,SNP,SNV\"", arity = 1)
+ public String types;
+
+ @Parameter(names = {"--s", "--study"}, description = "Query for study; comma separated list of study names",
+ arity = 1)
+ public String studies;
+
+ @Parameter(names = {"--biotype"}, description = "Query for biotype; comma separated list of biotype names,"
+ + " e.g.: protein_coding, pseudogene", arity = 1)
+ public String biotypes;
+
+ @Parameter(names = {"-r", "--region"}, description = "Query for region; comma separated list of regions,"
+ + " e.g.: 1:300000-400000000,15:343453463-8787665654", arity = 1)
+ public String regions;
+
+ @Parameter(names = {"--region-file"}, description = "Query for regions that are stored in a file, one region"
+ + " per line, e.g.: 1:6700000-560000000", arity = 1)
+ public String regionFilename;
+
+ @Parameter(names = {"--maf"}, description = "Query for the Minor Allele Frequency of a given study and"
+ + " cohort. Use the following format enclosed with double quotes: \"study_name::cohort_name"
+ + "[<|>|<=|>=|==|!=]value\", e.g.: \"1000g::all>0.4\"", arity = 1)
+ public String maf;
+
+ @Parameter(names = {"--mgf"}, description = "Query for the Minor Genotype Frequency of a given study and"
+ + " cohort. Use the following format enclosed with double quotes: \"study_name::cohort_name"
+ + "[<|>|<=|>=|==|!=]value\", e.g.: \"1000g::all>0.18198\"", arity = 1)
+ public String mgf;
+
+ @Parameter(names = {"--missing-allele"}, description = "Query for the number of missing alleles of a given"
+ + " study and cohort. Use the following format enclosed with double quotes: \"study_name::cohort_name"
+ + "[<|>|<=|>=|==|!=]value\", e.g.: \"1000g::all==5\"", arity = 1)
+ public String missingAlleles;
+
+ @Parameter(names = {"--missing-genotype"}, description = "Query for the number of missing genotypes of a"
+ + " given study and cohort. Use the following format enclosed with double quotes: \"study_name::"
+ + "cohort_name[<|>|<=|>=|==|!=]value\", e.g.: \"1000g::all!=0\"", arity = 1)
+ public String missingGenotypes;
+
+ @Parameter(names = {"--ct", "--consequence-type"}, description = "Query for Sequence Ontology term names or"
+ + " accession codes; comma separated (use double quotes if you provide term names), e.g.:"
+ + " \"transgenic insertion,SO:32234,SO:00124\"", arity = 1)
+ public String consequenceTypes;
+
+ @Parameter(names = {"--gene"}, description = "Query for gene; comma separated list of gene names, e.g.:"
+ + " \"BIN3,ZNF517\"", arity = 1)
+ public String genes;
+
+ @Parameter(names = {"--clinvar"}, description = "Query for clinvar (accession); comma separated list of"
+ + " accessions", arity = 1)
+ public String clinvar;
+
+ @Parameter(names = {"--cosmic"}, description = "Query for cosmic (mutation ID); comma separated list of"
+ + " mutations IDs", arity = 1)
+ public String cosmic;
+
+// @Parameter(names = {"--gwas"}, description = "Query for gwas (traits); comma separated list of traits",
+// arity = 1)
+// public String gwas;
+
+ @Parameter(names = {"--conservation"}, description = "Query for conservation scores (phastCons, phylop, gerp);"
+ + "comma separated list of scores and enclosed with double quotes, e.g.: \"phylop<0.3,phastCons<0.1\"",
+ arity = 1)
+ public String conservScores;
+
+ @Parameter(names = {"--ps", "--protein-substitution"}, description = "Query for protein substitution scores"
+ + " (polyphen, sift); comma separated list of scores and enclosed with double quotes, e.g.:"
+ + "\"polyphen>0.3,sift>0.6\"", arity = 1)
+ public String substScores;
+
+ @Parameter(names = {"--pf", "--population-frequency"}, description = "Query for alternate population"
+ + " frequency of a given study. Use the following format enclosed with double quotes:"
+ + " \"study_name::population_name[<|>|<=|>=|==|!=]frequency_value\", e.g.: \"1000g::CEU<0.4\"",
+ arity = 1)
+ public String pf;
+
+ @Parameter(names = {"--pmaf", "--population-maf"}, description = "Query for population minor allele frequency"
+ + " of a given study. Use the following the format enclosed with double quotes: \"study_name::"
+ + "population_name[<|>|<=|>=|==|!=]frequency_value\", e.g.: \"1000g::PJL<=0.25\"", arity = 1)
+ public String pmaf;
+
+ @Parameter(names = {"--sample-genotype"}, description = "Query for sample genotypes. Use the following the"
+ + " format enclosed with double quotes: \"sample_name1:genotype1,genotype;sample_name2:genotype1\","
+ + " e.g.: \"HG00112:0/0;HG23412:1/0,1/1\"", arity = 1)
+ public String sampleGenotypes;
+
+ @Parameter(names = {"--sample-filter"}, description = "Query for sample filter, i.e.: individual attributes (family, father,"
+ + " mother, sex and phenotype) and user-defined attributes from pedigree information,"
+ + " e.g.: \"individual.sex=MALE;Eyes=Blue\"", arity = 1)
+ public String sampleFilters;
+}
diff --git a/hpg-bigdata-analysis/src/main/java/org/opencb/hpg/bigdata/analysis/variant/LinearRegressionAnalysis.java b/hpg-bigdata-analysis/src/main/java/org/opencb/hpg/bigdata/analysis/variant/LinearRegressionAnalysis.java
index b83b5f89..ca5ca534 100644
--- a/hpg-bigdata-analysis/src/main/java/org/opencb/hpg/bigdata/analysis/variant/LinearRegressionAnalysis.java
+++ b/hpg-bigdata-analysis/src/main/java/org/opencb/hpg/bigdata/analysis/variant/LinearRegressionAnalysis.java
@@ -63,14 +63,13 @@ public void execute() {
System.out.println("r2: " + trainingSummary.r2());
}
- public LinearRegressionAnalysis(String datasetName, String studyName, String depVarName, String indepVarName) {
- this(datasetName, studyName, depVarName, indepVarName, 10, 0.3, 0.8);
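+ /** Delegates to the full constructor with defaults: 10 iterations, 0.3 regularization, 0.8 elastic-net mixing. */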
+ public LinearRegressionAnalysis(String studyId, String depVarName, String indepVarName) {
+ this(studyId, depVarName, indepVarName, 10, 0.3, 0.8);
}
- public LinearRegressionAnalysis(String datasetName, String studyName, String depVarName, String indepVarName,
+ public LinearRegressionAnalysis(String studyId, String depVarName, String indepVarName,
int numIterations, double regularization, double elasticNet) {
- this.datasetName = datasetName;
- this.studyName = studyName;
+ super(studyId);
this.depVarName = depVarName;
this.indepVarName = indepVarName;
this.numIterations = numIterations;
diff --git a/hpg-bigdata-analysis/src/main/java/org/opencb/hpg/bigdata/analysis/variant/LogisticRegressionAnalysis.java b/hpg-bigdata-analysis/src/main/java/org/opencb/hpg/bigdata/analysis/variant/LogisticRegressionAnalysis.java
index 4b58c3e3..d02a20e7 100644
--- a/hpg-bigdata-analysis/src/main/java/org/opencb/hpg/bigdata/analysis/variant/LogisticRegressionAnalysis.java
+++ b/hpg-bigdata-analysis/src/main/java/org/opencb/hpg/bigdata/analysis/variant/LogisticRegressionAnalysis.java
@@ -70,14 +70,13 @@ public void execute() {
lrModel.setThreshold(bestThreshold);
}
- public LogisticRegressionAnalysis(String datasetName, String studyName, String depVarName, String indepVarName) {
- this(datasetName, studyName, depVarName, indepVarName, 10, 0.3, 0.8);
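+ /** Delegates to the full constructor with defaults: 10 iterations, 0.3 regularization, 0.8 elastic-net mixing. */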
+ public LogisticRegressionAnalysis(String studyId, String depVarName, String indepVarName) {
+ this(studyId, depVarName, indepVarName, 10, 0.3, 0.8);
}
- public LogisticRegressionAnalysis(String datasetName, String studyName, String depVarName, String indepVarName,
+ public LogisticRegressionAnalysis(String studyId, String depVarName, String indepVarName,
int numIterations, double regularization, double elasticNet) {
- this.datasetName = datasetName;
- this.studyName = studyName;
+ super(studyId);
this.depVarName = depVarName;
this.indepVarName = indepVarName;
this.numIterations = numIterations;
diff --git a/hpg-bigdata-analysis/src/main/java/org/opencb/hpg/bigdata/analysis/variant/PCAAnalysis.java b/hpg-bigdata-analysis/src/main/java/org/opencb/hpg/bigdata/analysis/variant/PCAAnalysis.java
index 52deb02e..af33e6c9 100644
--- a/hpg-bigdata-analysis/src/main/java/org/opencb/hpg/bigdata/analysis/variant/PCAAnalysis.java
+++ b/hpg-bigdata-analysis/src/main/java/org/opencb/hpg/bigdata/analysis/variant/PCAAnalysis.java
@@ -29,13 +29,12 @@ public void execute() {
result.show(false);
}
- public PCAAnalysis(String datasetName, String studyName, String featureName) {
- this(datasetName, studyName, featureName, 3);
+ public PCAAnalysis(String studyId, String featureName) {
+ this(studyId, featureName, 3);
}
- public PCAAnalysis(String datasetName, String studyName, String featureName, int kValue) {
- this.datasetName = datasetName;
- this.studyName = studyName;
+ public PCAAnalysis(String studyId, String featureName, int kValue) {
+ super(studyId);
this.featureName = featureName;
this.kValue = kValue;
}
diff --git a/hpg-bigdata-analysis/src/main/java/org/opencb/hpg/bigdata/analysis/variant/RvTestsAdaptor.java b/hpg-bigdata-analysis/src/main/java/org/opencb/hpg/bigdata/analysis/variant/RvTestsAdaptor.java
deleted file mode 100644
index df9d9f07..00000000
--- a/hpg-bigdata-analysis/src/main/java/org/opencb/hpg/bigdata/analysis/variant/RvTestsAdaptor.java
+++ /dev/null
@@ -1,468 +0,0 @@
-package org.opencb.hpg.bigdata.analysis.variant;
-
-import org.apache.spark.SparkConf;
-import org.apache.spark.SparkContext;
-import org.apache.spark.api.java.JavaPairRDD;
-import org.apache.spark.api.java.JavaRDD;
-import org.apache.spark.api.java.function.Function;
-import org.apache.spark.sql.Dataset;
-import org.apache.spark.sql.Row;
-import org.apache.spark.sql.SparkSession;
-import org.opencb.biodata.formats.pedigree.PedigreeManager;
-import org.opencb.biodata.models.core.Region;
-import org.opencb.biodata.models.core.pedigree.Pedigree;
-import org.opencb.biodata.tools.variant.metadata.VariantMetadataManager;
-import org.opencb.commons.utils.FileUtils;
-import org.opencb.hpg.bigdata.analysis.AnalysisExecutor;
-import org.opencb.hpg.bigdata.analysis.AnalysisExecutorException;
-import org.opencb.hpg.bigdata.core.lib.SparkConfCreator;
-import org.opencb.hpg.bigdata.core.lib.VariantDataset;
-import scala.Tuple2;
-import scala.collection.mutable.StringBuilder;
-import scala.collection.mutable.WrappedArray;
-
-import java.io.*;
-import java.nio.file.Path;
-import java.nio.file.Paths;
-import java.util.*;
-
-/**
- * Created by joaquin on 1/19/17.
- */
-public class RvTestsAdaptor extends AnalysisExecutor implements Serializable {
- private String inFilename;
- private String metaFilename;
- private String outDirname;
- private String confFilename;
-
- private final String RVTEST_BIN = "rvtests"; //"/home/jtarraga/soft/rvtests/executable/rvtest";
- private final String BGZIP_BIN = "bgzip"; //"/home/joaquin/softs/htslib/bgzip";
- private final String TABIX_BIN = "tabix"; //"/home/joaquin/softs/htslib/tabix";
-
- public RvTestsAdaptor(String inFilename, String outDirname, String confFilename) {
- this(inFilename, inFilename + ".meta.json", outDirname, confFilename);
- }
-
- public RvTestsAdaptor(String inFilename, String metaFilename, String outDirname, String confFilename) {
- this.inFilename = inFilename;
- this.metaFilename = metaFilename;
- this.outDirname = outDirname;
- this.confFilename = confFilename;
- }
-
-// ./build/bin/hpg-bigdata-local2.sh variant rvtests -i ~/data/vcf/skat/example.vcf.avro -o ~/data/vcf/skat/out
-// --dataset noname -c ~/data/vcf/skat/skat.params
-
- public void run(String datasetName) throws Exception {
- // create spark session
- SparkConf sparkConf;
- if (inFilename.startsWith("/")) {
- sparkConf = SparkConfCreator.getConf("variant rvtests", "local", 1, true);
- } else {
- sparkConf = new SparkConf().setAppName("variant rvtests");
- }
- SparkSession sparkSession = new SparkSession(new SparkContext(sparkConf));
-
- // load dataset
- VariantDataset vd = new VariantDataset(sparkSession);
- vd.load(inFilename);
- vd.createOrReplaceTempView("vcf");
-
- // load rvtests parameters into properties
- Properties props = new Properties();
- InputStream confStream = new FileInputStream(confFilename);
- props.load(confStream);
- confStream.close();
-
- for (Object key: props.keySet()) {
- System.out.println((String) key + " = " + (String) props.get(key));
- }
-
- List<String> kernels = null;
- String kernel = props.getProperty("kernel");
- if (kernel != null) {
- kernels = Arrays.asList(kernel.toLowerCase()
- .replace("skato", "SkatO")
- .replace("skat", "Skat")
- .split(","));
- }
-
- // create temporary directory
- File tmpDir = new File(outDirname + "/tmp");
- tmpDir.mkdir();
-
- // create temporary file for --pheno
- File phenoFile = new File(tmpDir.getAbsolutePath() + "/pheno");
- VariantMetadataManager metadataManager = new VariantMetadataManager();
- metadataManager.load(Paths.get(metaFilename));
- Pedigree pedigree = metadataManager.getPedigree(datasetName);
- new PedigreeManager().save(pedigree, phenoFile.toPath());
-
- // loop for regions
- String line;
- BufferedWriter writer = null;
- BufferedReader reader = FileUtils.newBufferedReader(Paths.get(props.getProperty("setFile")));
- int i = 0;
- StringBuilder sb = new StringBuilder();
- while ((line = reader.readLine()) != null) {
- String[] fields = line.split("[\t ]");
- System.out.println(fields[0]);
- Region region = new Region(fields[1]);
-
- // create temporary files for --inVcf and --setFile
- File setFile = new File(tmpDir.getAbsolutePath() + "/setFile." + i);
- writer = FileUtils.newBufferedWriter(setFile.toPath());
- writer.write(fields[0] + "\t" + fields[1] + "\n");
- writer.close();
-
- // create temporary vcf file fot the region variants
- VariantDataset ds = (VariantDataset) vd.regionFilter(region);
- File vcfFile = new File(tmpDir.getAbsolutePath() + "/variants." + i + ".vcf");
- writer = FileUtils.newBufferedWriter(vcfFile.toPath());
-
- // header lines
- writer.write("##fileformat=VCFv4.2\n");
- writer.write("#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT");
- // sample names
- for (String key: pedigree.getIndividuals().keySet()) {
- writer.write("\t");
- writer.write(pedigree.getIndividuals().get(key).getId());
- }
- writer.write("\n");
-
- // variant lines
-// writer.write(i + " >>>>>>>>>>>>>> " + line + ", num. rows = " + ds.count() + "\n");
-// writer.close();
-// System.out.println("\n\n" + i + " >>>>>>>>>>>>>> " + line + ", num. rows = " + ds.count() + "\n");
-// ds.show(1, false);
-// ds.reset();
-
- //System.out.println("----------> !!!! rows should be sorted by start !!!" + ds.getSql());
-// ds.executeSql(ds.getSql()).show();
-// System.out.println("----------> rows should be sorted by start !!!");
-// ds.executeSql("SELECT * FROM vcf WHERE (chromosome = '22' AND start >= 32755892 AND end <= 32767063)"
-// + " ORDER BY start ASC").show();
-// System.exit(0);
-
- //List<Row> rows = ds.collectAsList();
- List<Row> rows = ds.executeSql(ds.getSql()).collectAsList();
- ds.reset();
-
- for (Row row: rows) {
- List<Row> studies = row.getList(12);
- Row study = null;
- for (int j = 0; j < studies.size(); j++) {
- if (studies.get(j).get(0).equals(datasetName)) {
- study = studies.get(j);
- break;
- }
- }
-
- if (study == null) {
- throw new Exception("Dataset '" + datasetName + "' not found!");
- }
-
-// System.out.println("\n\n\n" + row.mkString() + "\n\n");
- List<Row> files = study.getList(1);
- String filter = (String) files.get(0).getJavaMap(2).get("FILTER");
- String qual = (String) files.get(0).getJavaMap(2).get("QUAL");
-
- sb.setLength(0);
- sb.append(row.get(2)).append("\t").append(row.get(3)).append("\t")
- .append(row.get(0) == null ? "." : row.get(0)).append("\t")
- .append(row.get(5)).append("\t").append(row.get(6)).append("\t").append(qual).append("\t")
- .append(filter).append("\t").append(".").append("\t").append(study.getList(3).get(0));
-
- int j = 0;
- try {
-// System.out.println("size for (" + i + "): " + sb.toString());
-// System.out.println("\tsize = " + study.getList(4).size());
- for (j = 0; j < pedigree.getIndividuals().size(); j++) {
-// System.out.println(j + ": ");
-// System.out.println("\t" + ((WrappedArray) study.getList(4).get(j)).head());
- sb.append("\t").append(((WrappedArray) study.getList(4).get(j)).head());
- }
- } catch (Exception ex) {
- System.out.println("Exception for sample data: " + sb.toString());
- System.out.println("(i, j, sample data size, pedigree size) = ("
- + i + ", " + j + ", " + study.getList(4).size() + ", " + pedigree.getIndividuals().size());
- System.out.println(ex.getMessage());
- ex.printStackTrace();
- }
- sb.append("\n");
- writer.write(sb.toString());
-// System.out.println(sb.toString());
- }
- writer.close();
-
- // compress vcf to bgz
- sb.setLength(0);
- sb.append(props.getProperty("bgzip", this.BGZIP_BIN)).append(" ").append(vcfFile.getAbsolutePath());
- execute(sb.toString(), "Compressing vcf to gz: " + sb.toString());
-
- // and create tabix index
- sb.setLength(0);
- sb.append(props.getProperty("tabix", this.TABIX_BIN)).append(" -p vcf ").append(vcfFile.getAbsolutePath()).append(".gz");
- execute(sb.toString(), "Creating tabix index: " + sb);
-
- // rvtests command line
- sb.setLength(0);
- sb.append(props.getProperty("rvtests", this.RVTEST_BIN))
- .append(kernel == null ? " " : " --kernel " + kernel)
- .append(" --pheno ").append(phenoFile.getAbsolutePath())
- .append(" --inVcf ").append(vcfFile.getAbsolutePath()).append(".gz")
- .append(" --setFile ").append(setFile.getAbsolutePath())
- .append(" --out ").append(tmpDir.getAbsolutePath()).append("/out.").append(i);
- execute(sb.toString(), "Running rvtests: " + sb);
-
- i++;
- }
-
- // write final
- for (String k: kernels) {
- boolean header = false;
- writer = FileUtils.newBufferedWriter(Paths.get(outDirname + "/out." + k + ".assoc"));
- for (int j = 0; j < i; j++) {
- File file = new File(tmpDir.getAbsolutePath() + "/out." + j + "." + k + ".assoc");
- if (file.exists()) {
- reader = FileUtils.newBufferedReader(file.toPath());
- line = reader.readLine();
- if (!header) {
- writer.write(line);
- writer.write("\n");
- header = true;
- }
- while ((line = reader.readLine()) != null) {
- writer.write(line);
- writer.write("\n");
- }
- reader.close();
- }
- }
- writer.close();
- }
- }
-
- public void run00(String datasetName) throws Exception {
- // create spark session
- SparkConf sparkConf;
- if (inFilename.startsWith("/")) {
- sparkConf = SparkConfCreator.getConf("variant rvtests", "local", 1, true);
- } else {
- sparkConf = new SparkConf().setAppName("variant rvtests");
- }
- SparkSession sparkSession = new SparkSession(new SparkContext(sparkConf));
-
- // load dataset
- VariantDataset vd = new VariantDataset(sparkSession);
- vd.load(inFilename);
- vd.createOrReplaceTempView("vcf");
-
- // load rvtests parameters into properties
- Properties props = new Properties();
- InputStream confStream = new FileInputStream(confFilename);
- props.load(confStream);
- confStream.close();
-
- List<String> kernels = null;
- String kernel = props.getProperty("kernel");
- if (kernel != null) {
- kernels = Arrays.asList(kernel.toLowerCase()
- .replace("skato", "SkatO")
- .replace("skat", "Skat")
- .split(","));
- }
-
- // create temporary directory
- File tmpDir = new File(outDirname + "/tmp");
- tmpDir.mkdir();
-
- // create temporary file for --pheno
- File phenoFile = new File(tmpDir.getAbsolutePath() + "/pheno");
- VariantMetadataManager metadataManager = new VariantMetadataManager();
- metadataManager.load(Paths.get(metaFilename));
- Pedigree pedigree = metadataManager.getPedigree(datasetName);
- new PedigreeManager().save(pedigree, phenoFile.toPath());
-
- Map<String, String> pedigreeMap = new LinkedHashMap<>();
- for (String key : pedigree.getIndividuals().keySet()) {
- pedigreeMap.put(key, pedigree.getIndividuals().get(key).getId());
- }
-
- String sql = "SELECT * FROM vcf";
- Dataset<Row> rows = vd.sqlContext().sql(sql);
-
- List<String> regions = new ArrayList<>();
- regions.add("22:40766594-40806293");
- regions.add("22:32755892-32767063");
- regions.add("22:32755894-32766972");
- regions.add("22:29834571-29838444");
- regions.add("22:31835344-31885547");
- regions.add("22:31608249-31676066");
- regions.add("22:38615297-38668670");
- regions.add("22:38822332-38851203");
- regions.add("22:20455993-20461786");
-
- JavaPairRDD<String, Iterable<Row>> groups = rows.javaRDD().groupBy(row -> {
- String key = null;
- for (String region: regions) {
- String[] fields = region.split("[:-]");
- if (fields[0].equals(row.getString(2))
- && Integer.parseInt(fields[1]) <= row.getInt(4)
- && Integer.parseInt(fields[2]) >= row.getInt(3)) {
- key = region;
- break;
- }
- }
- return key;
- });
-
- JavaRDD<String> results = groups.map(new Function<Tuple2<String, Iterable<Row>>, String>() {
- @Override
- public String call(Tuple2<String, Iterable<Row>> tuple) throws Exception {
- StringBuilder sb = new StringBuilder();
- BufferedWriter writer;
- Iterator<Row> iterator = tuple._2.iterator();
-// System.out.println("+++++++++++++ " + tuple._1);
-
- String id = tuple._1.replace(":", "_").replace("-", "_");
-
- // create temporary files for --inVcf and --setFile
- File setFile = new File(tmpDir.getAbsolutePath() + "/setFile." + id);
- writer = FileUtils.newBufferedWriter(setFile.toPath());
- writer.write(id + "\t" + tuple._1 + "\n");
- writer.close();
-
- File vcfFile = new File(tmpDir.getAbsolutePath() + "/variants."
- + id + ".vcf");
- writer = FileUtils.newBufferedWriter(vcfFile.toPath());
- // header lines
- writer.write("##fileformat=VCFv4.2\n");
- writer.write("#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT");
- // sample names
- for (String key : pedigreeMap.keySet()) {
- writer.write("\t");
- writer.write(pedigreeMap.get(key));
- }
- writer.write("\n");
- while (iterator.hasNext()) {
- Row row = iterator.next();
-// System.out.println("=================== " + row);
-
- // variant lines
- List<Row> studies = row.getList(12);
- Row study = null;
- for (int j = 0; j < studies.size(); j++) {
- if (studies.get(j).get(0).equals(datasetName)) {
- study = studies.get(j);
- break;
- }
- }
-
- if (study == null) {
- throw new Exception("Dataset '" + datasetName + "' not found!");
- }
- List<Row> files = study.getList(1);
- String filter = (String) files.get(0).getJavaMap(2).get("FILTER");
- String qual = (String) files.get(0).getJavaMap(2).get("QUAL");
-
- sb.setLength(0);
- sb.append(row.get(2)).append("\t").append(row.get(3)).append("\t")
- .append(row.get(0) == null ? "." : row.get(0)).append("\t")
- .append(row.get(5)).append("\t").append(row.get(6)).append("\t").append(qual).append("\t")
- .append(filter).append("\t").append(".").append("\t").append(study.getList(3).get(0));
-
- for (int j = 0; j < pedigreeMap.size(); j++) {
- sb.append("\t").append(((WrappedArray) study.getList(4).get(j)).head());
- }
- sb.append("\n");
- writer.write(sb.toString());
- System.out.println(sb.toString());
- }
- writer.close();
-
- // compress vcf to bgz
- sb.setLength(0);
- sb.append(props.getProperty("bgzip")).append(" ").append(vcfFile.getAbsolutePath());
- execute(sb.toString(), "Compressing vcf to gz: " + sb);
-
- // and create tabix index
- sb.setLength(0);
- sb.append(props.getProperty("tabix")).append(" -p vcf ").append(vcfFile.getAbsolutePath()).append(".gz");
- execute(sb.toString(), "Creating tabix index: " + sb);
-
- // rvtests command line
- sb.setLength(0);
- sb.append(props.getProperty("rvtests"))
- .append(kernel == null ? " " : " --kernel " + kernel)
- .append(" --pheno ").append(phenoFile.getAbsolutePath())
- .append(" --inVcf ").append(vcfFile.getAbsolutePath()).append(".gz")
- .append(" --setFile ").append(setFile.getAbsolutePath())
- .append(" --out ").append(tmpDir.getAbsolutePath()).append("/out.").append(id);
- execute(sb.toString(), "Running rvtests: " + sb);
-
- sb.setLength(0);
- File file = new File(tmpDir.getAbsolutePath() + "/out." + id + ".Skat.assoc");
- if (file.exists()) {
- BufferedReader reader = FileUtils.newBufferedReader(file.toPath());
- String line = reader.readLine();
- while ((line = reader.readLine()) != null) {
- sb.append(line).append("\n");
- }
- reader.close();
- }
-
- return sb.toString();
- }
- });
-
- System.out.println(results.collect());
- }
-
- private Process execute(String cmdline, String label) {
- Process p = null;
- try {
- System.out.println(label);
- System.out.println("Executing: " + cmdline);
- p = Runtime.getRuntime().exec(cmdline);
-
- System.out.println("\tSTDOUT:");
- System.out.println(readInputStream(p.getInputStream()));
- System.out.println("\tSTDERR:");
- System.out.println(readInputStream(p.getErrorStream()));
- } catch (Exception e) {
- e.printStackTrace();
- }
- return p;
- }
-
- private void saveStream(InputStream inputStream, Path path) {
- try {
- PrintWriter writer = new PrintWriter(path.toFile());
- writer.write(readInputStream(inputStream));
- writer.close();
- } catch (Exception e) {
- e.printStackTrace();
- }
- }
-
- private String readInputStream(InputStream inputStream) {
- StringBuilder res = new StringBuilder();
- try {
- // read the output from the command
- BufferedReader stdInput = new BufferedReader(new InputStreamReader(inputStream));
- String s;
- System.out.println("Here is the standard output of the command:\n");
- while ((s = stdInput.readLine()) != null) {
- res.append(s).append("\n");
- }
- } catch (IOException e) {
- res.append(e.getMessage()).append("\n");
- }
- return res.toString();
- }
-
- @Override
- public void execute() throws AnalysisExecutorException {
- }
-}
diff --git a/hpg-bigdata-analysis/src/main/java/org/opencb/hpg/bigdata/analysis/variant/VariantAnalysisExecutor.java b/hpg-bigdata-analysis/src/main/java/org/opencb/hpg/bigdata/analysis/variant/VariantAnalysisExecutor.java
index 6e3fa4e0..ad9f8891 100644
--- a/hpg-bigdata-analysis/src/main/java/org/opencb/hpg/bigdata/analysis/variant/VariantAnalysisExecutor.java
+++ b/hpg-bigdata-analysis/src/main/java/org/opencb/hpg/bigdata/analysis/variant/VariantAnalysisExecutor.java
@@ -7,13 +7,7 @@
*/
public abstract class VariantAnalysisExecutor extends AnalysisExecutor {
- protected String studyName;
-
- public String getStudyName() {
- return studyName;
- }
-
- public void setStudyName(String studyName) {
- this.studyName = studyName;
+ protected VariantAnalysisExecutor(String studyId) {
+ super(studyId);
}
}
diff --git a/hpg-bigdata-analysis/src/main/java/org/opencb/hpg/bigdata/analysis/variant/VariantAnalysisUtils.java b/hpg-bigdata-analysis/src/main/java/org/opencb/hpg/bigdata/analysis/variant/VariantAnalysisUtils.java
new file mode 100644
index 00000000..e110b568
--- /dev/null
+++ b/hpg-bigdata-analysis/src/main/java/org/opencb/hpg/bigdata/analysis/variant/VariantAnalysisUtils.java
@@ -0,0 +1,287 @@
+package org.opencb.hpg.bigdata.analysis.variant;
+
+import htsjdk.variant.variantcontext.writer.Options;
+import org.apache.commons.lang3.StringUtils;
+import org.apache.spark.SparkConf;
+import org.apache.spark.SparkContext;
+import org.apache.spark.sql.SparkSession;
+import org.opencb.biodata.formats.pedigree.PedigreeManager;
+import org.opencb.biodata.models.core.Region;
+import org.opencb.biodata.models.core.pedigree.Pedigree;
+import org.opencb.biodata.models.metadata.Sample;
+import org.opencb.biodata.models.variant.metadata.VariantStudyMetadata;
+import org.opencb.biodata.tools.variant.converters.VCFExporter;
+import org.opencb.biodata.tools.variant.metadata.VariantMetadataManager;
+import org.opencb.commons.datastore.core.Query;
+import org.opencb.hpg.bigdata.core.lib.SparkConfCreator;
+import org.opencb.hpg.bigdata.core.lib.VariantDataset;
+
+import java.io.File;
+import java.io.IOException;
+import java.nio.file.Files;
+import java.nio.file.Paths;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.List;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import static java.nio.file.Paths.get;
+
+public class VariantAnalysisUtils {
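+ // Splits a filter expression such as "individual.sex=MALE" into a key, group(1)
+ // ("individual.sex"), and the operator-plus-value remainder, group(2) ("=MALE")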
+ public static final Pattern OPERATION_PATTERN = Pattern.compile("([^=<>~!]+)(.*)$");
+
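+ /**
+ * Exports variants from an Avro file to a VCF file, applying the given filters.
+ * A minimal usage sketch (paths are illustrative only):
+ * <pre>
+ * FilterParameters filters = new FilterParameters();
+ * filters.regions = "22:16050000-16063000";
+ * VariantAnalysisUtils.exportVCF("/data/study.vcf.avro",
+ * "/data/study.vcf.avro.meta.json", filters, "/tmp/subset.vcf");
+ * </pre>
+ *
+ * @param inputAvroFilename Input Avro file name
+ * @param metaFilename Variant metadata file name
+ * @param filterParams Filter parameters (may be null)
+ * @param vcfFilename Output VCF file name
+ */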
+ public static void exportVCF(String inputAvroFilename, String metaFilename,
+ FilterParameters filterParams, String vcfFilename) {
+ // Generate VCF file by calling VCF exporter from query and query options
+ VariantMetadataManager manager = new VariantMetadataManager();
+ try {
+ manager.load(Paths.get(metaFilename));
+
+ SparkConf sparkConf = SparkConfCreator.getConf("tool plink", "local", 1, true);
+ SparkSession sparkSession = new SparkSession(new SparkContext(sparkConf));
+
+ VariantDataset vd = new VariantDataset(sparkSession);
+ vd.load(inputAvroFilename);
+ vd.createOrReplaceTempView("vcf");
+
+ // Add filters to variant dataset
+ if (filterParams != null) {
+ addVariantFilters(filterParams, vd);
+ }
+
+ // out filename
+ VariantStudyMetadata studyMetadata = manager.getVariantMetadata().getStudies().get(0);
+ VCFExporter vcfExporter = new VCFExporter(studyMetadata);
+ vcfExporter.open(Options.ALLOW_MISSING_FIELDS_IN_HEADER, Paths.get(vcfFilename));
+
+ vcfExporter.export(vd.iterator());
+
+ // close everything
+ vcfExporter.close();
+ sparkSession.stop();
+ } catch (Exception e) {
+ System.out.println("Error retrieving variants from Avro to VCF file: " + e.getMessage());
+ }
+ }
+
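+ /**
+ * Translates the filter options into VariantDataset filters: ID, type, biotype,
+ * study, cohort stats (MAF, MGF, missing alleles/genotypes), region, sample
+ * genotype and annotation filters.
+ *
+ * @param filterParams Filter parameters to apply
+ * @param vd Target variant dataset
+ * @throws IOException If an ID or region file cannot be read
+ */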
+ public static void addVariantFilters(FilterParameters filterParams, VariantDataset vd) throws IOException {
+ VariantMetadataManager metadataManager = new VariantMetadataManager();
+ //metadataManager.load(Paths.get(variantCommandOptions.queryVariantCommandOptions.input + ".meta.json"));
+
+ // query for ID (list and file)
+ List<String> list = null;
+ if (StringUtils.isNotEmpty(filterParams.ids)) {
+ // wrap in ArrayList: Arrays.asList is fixed-size and IDs from a file may be appended below
+ list = new ArrayList<>(Arrays.asList(StringUtils.split(filterParams.ids, ",")));
+ }
+ String idFilename = filterParams.idFilename;
+ if (StringUtils.isNotEmpty(idFilename) && new File(idFilename).exists()) {
+ if (list == null) {
+ list = Files.readAllLines(get(idFilename));
+ } else {
+ list.addAll(Files.readAllLines(get(idFilename)));
+ }
+ }
+ if (list != null) {
+ vd.idFilter(list, false);
+ }
+
+ // query for type
+ if (StringUtils.isNotEmpty(filterParams.types)) {
+ vd.typeFilter(Arrays.asList(
+ StringUtils.split(filterParams.types, ",")));
+ }
+
+ // query for biotype
+ if (StringUtils.isNotEmpty(filterParams.biotypes)) {
+ vd.annotationFilter("biotype", Arrays.asList(
+ StringUtils.split(filterParams.biotypes, ",")));
+ }
+
+ // query for study
+ if (StringUtils.isNotEmpty(filterParams.studies)) {
+ vd.studyFilter("studyId", Arrays.asList(
+ StringUtils.split(filterParams.studies, ",")));
+ }
+
+ // query for maf (study:cohort)
+ if (StringUtils.isNotEmpty(filterParams.maf)) {
+ vd.studyFilter("stats.maf", filterParams.maf);
+ }
+
+ // query for mgf (study:cohort)
+ if (StringUtils.isNotEmpty(filterParams.mgf)) {
+ vd.studyFilter("stats.mgf", filterParams.mgf);
+ }
+
+ // query for number of missing alleles (study:cohort)
+ if (StringUtils.isNotEmpty(filterParams.missingAlleles)) {
+ vd.studyFilter("stats.missingAlleles", filterParams.missingAlleles);
+ }
+
+ // query for number of missing genotypes (study:cohort)
+ if (StringUtils.isNotEmpty(filterParams.missingGenotypes)) {
+ vd.studyFilter("stats.missingGenotypes", filterParams.missingGenotypes);
+ }
+
+ // query for region (list and file)
+ List<Region> regions = getRegionList(filterParams.regions,
+ filterParams.regionFilename);
+ if (regions != null && regions.size() > 0) {
+ vd.regionFilter(regions);
+ }
+
+ // query for sample genotypes and/or sample filters
+ StringBuilder sampleGenotypes = new StringBuilder();
+ if (StringUtils.isNotEmpty(filterParams.sampleGenotypes)) {
+ sampleGenotypes.append(filterParams.sampleGenotypes);
+ }
+ String sampleFilters = filterParams.sampleFilters;
+ if (StringUtils.isNotEmpty(sampleGenotypes) || StringUtils.isNotEmpty(sampleFilters)) {
+ // TODO: we need the ID for dataset target
+ List<Sample> samples = null;
+
+
+ if (StringUtils.isNotEmpty(sampleFilters)) {
+ Query sampleQuery = new Query();
+// final Pattern OPERATION_PATTERN = Pattern.compile("([^=<>~!]+.*)(<=?|>=?|!=|!?=?~|==?)([^=<>~!]+.*)$");
+ String[] splits = sampleFilters.split("[;]");
+ for (int i = 0; i < splits.length; i++) {
+ Matcher matcher = OPERATION_PATTERN.matcher(splits[i]);
+ if (matcher.matches()) {
+ sampleQuery.put(matcher.group(1), matcher.group(2));
+ }
+ }
+
+ samples = metadataManager.getSamples(sampleQuery,
+ metadataManager.getVariantMetadata().getStudies().get(0).getId());
+
+ for (Sample sample : samples) {
+ if (sampleGenotypes.length() > 0) {
+ sampleGenotypes.append(";");
+ }
+ //sampleGenotypes.append(sample.getId()).append(":0|1,1|0,1|1");
+ sampleGenotypes.append(sample.getId()).append(":1|1");
+ }
+ }
+ samples = metadataManager.getSamples(
+ metadataManager.getVariantMetadata().getStudies().get(0).getId());
+
+ // e.g.: sample genotypes = sample1:0|0;sample2:1|0,1|1
+ String[] values = sampleGenotypes.toString().split("[;]");
+ StringBuilder newSampleGenotypes = new StringBuilder();
+ // String.split never returns null, so there is always at least one token
+ newSampleGenotypes.append(updateSampleGenotype(values[0], samples));
+ for (int i = 1; i < values.length; i++) {
+ newSampleGenotypes.append(";");
+ newSampleGenotypes.append(updateSampleGenotype(values[i], samples));
+ }
+ if (!StringUtils.isEmpty(newSampleGenotypes)) {
+ vd.sampleFilter("GT", newSampleGenotypes.toString());
+ } else {
+ System.err.format("Error: could not parse your sample genotypes %s.\n", sampleGenotypes);
+ }
+ }
+
+ // query for consequence type (Sequence Ontology term names and accession codes)
+ annotationFilterNotEmpty("consequenceTypes.sequenceOntologyTerms", filterParams.consequenceTypes, vd);
+
+ // query for consequence type (gene names)
+ annotationFilterNotEmpty("consequenceTypes.geneName", filterParams.genes, vd);
+
+ // query for clinvar (accession)
+ annotationFilterNotEmpty("variantTraitAssociation.clinvar.accession", filterParams.clinvar, vd);
+
+ // query for cosmic (mutation ID)
+ annotationFilterNotEmpty("variantTraitAssociation.cosmic.mutationId", filterParams.cosmic, vd);
+
+ // query for conservation (phastCons, phylop, gerp)
+ annotationFilterNotEmpty("conservation", filterParams.conservScores, vd);
+
+ // query for protein substitution scores (polyphen, sift)
+ annotationFilterNotEmpty("consequenceTypes.proteinVariantAnnotation.substitutionScores", filterParams.substScores, vd);
+
+ // query for alternate population frequency (study:population)
+ annotationFilterNotEmpty("populationFrequencies.altAlleleFreq", filterParams.pf, vd);
+
+ // query for population minor allele frequency (study:population)
+ annotationFilterNotEmpty("populationFrequencies.refAlleleFreq", filterParams.pmaf, vd);
+ }
+
+ private static void annotationFilterNotEmpty(String key, String value, VariantDataset vd) {
+ if (StringUtils.isNotEmpty(value)) {
+ vd.annotationFilter(key, value);
+ }
+ }
+
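+ /**
+ * Builds a region list from a comma-separated region string and/or a file with
+ * one region per line, e.g.: getRegionList("1:1000-2000,2:5000-6000", null);
+ * returns null when neither source is provided.
+ *
+ * @param regions Comma-separated list of regions
+ * @param regionFilename File containing one region per line
+ * @return List of regions, or null
+ * @throws IOException If the region file cannot be read
+ */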
+ public static List<Region> getRegionList(String regions, String regionFilename) throws IOException {
+ List<Region> list = null;
+ if (StringUtils.isNotEmpty(regions)) {
+ list = Region.parseRegions(regions);
+ }
+ if (StringUtils.isNotEmpty(regionFilename) && new File(regionFilename).exists()) {
+ if (list == null) {
+ list = new ArrayList<>();
+ }
+ List<String> lines = Files.readAllLines(Paths.get(regionFilename));
+ for (String line : lines) {
+ list.add(new Region(line));
+ }
+ }
+ return list;
+ }
+
+ /**
+ * Update the sample genotype query string by replacing the sample name by
+ * its sample order, e.g.: from sample2:1|0,1|1 to 32:1|0,1|1.
+ *
+ * @param sampleGenotype Sample genotype query string
+ * @param samples Sample list in the right order (to get the sample index)
+ * @return Updated sample genotype query string
+ */
+ private static String updateSampleGenotype(String sampleGenotype, List<Sample> samples) {
+ // e.g.: value = sample2:1|0,1|1
+ StringBuilder newSampleGenotype = new StringBuilder();
+ String[] splits = sampleGenotype.split("[:]");
+ if (splits.length != 2) {
+ // error, expected sample_name:genotype1[,genotype2...]
+ System.err.format("Error: invalid expression %s for sample genotypes.\n", sampleGenotype);
+ } else {
+ boolean found = false;
+ // TODO: move this functionality to the VariantMetadataManager (from sample name to sample index)
+ for (int i = 0; i < samples.size(); i++) {
+ if (splits[0].equals(samples.get(i).getId())) {
+ newSampleGenotype.append(i).append(":").append(splits[1]);
+ found = true;
+ break;
+ }
+ }
+ // Sanity check
+ if (!found) {
+ // error
+ System.err.format("Error: sample %s not found in dataset.\n", splits[0]);
+ }
+ }
+ System.out.println(sampleGenotype + " -> " + newSampleGenotype.toString());
+ return newSampleGenotype.toString();
+ }
+
+ /**
+ * Export variant metadata into pedigree file format.
+ *
+ * @param metaFilename Variant metadata file name
+ * @param studyId Study ID target
+ * @param pedFilename Pedigree file name
+ * @throws IOException IO exception
+ */
+ public static void exportPedigree(String metaFilename, String studyId, String pedFilename) throws IOException {
+ VariantMetadataManager metadataManager = new VariantMetadataManager();
+ metadataManager.load(Paths.get(metaFilename));
+ Pedigree pedigree = metadataManager.getPedigree(studyId);
+ PedigreeManager pedigreeManager = new PedigreeManager();
+ pedigreeManager.save(pedigree, Paths.get(pedFilename));
+ }
+}
diff --git a/hpg-bigdata-analysis/src/main/java/org/opencb/hpg/bigdata/analysis/tools/adaptors/PlinkAdaptor.java b/hpg-bigdata-analysis/src/main/java/org/opencb/hpg/bigdata/analysis/variant/wrappers/PlinkWrapper.java
similarity index 89%
rename from hpg-bigdata-analysis/src/main/java/org/opencb/hpg/bigdata/analysis/tools/adaptors/PlinkAdaptor.java
rename to hpg-bigdata-analysis/src/main/java/org/opencb/hpg/bigdata/analysis/variant/wrappers/PlinkWrapper.java
index 6579b403..6457b0d4 100644
--- a/hpg-bigdata-analysis/src/main/java/org/opencb/hpg/bigdata/analysis/tools/adaptors/PlinkAdaptor.java
+++ b/hpg-bigdata-analysis/src/main/java/org/opencb/hpg/bigdata/analysis/variant/wrappers/PlinkWrapper.java
@@ -1,4 +1,4 @@
-package org.opencb.hpg.bigdata.analysis.tools.adaptors;
+package org.opencb.hpg.bigdata.analysis.variant.wrappers;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.fasterxml.jackson.databind.ObjectReader;
@@ -14,6 +14,7 @@
import org.opencb.hpg.bigdata.analysis.tools.Executor;
import org.opencb.hpg.bigdata.analysis.tools.Status;
import org.opencb.hpg.bigdata.analysis.tools.ToolManager;
+import org.opencb.hpg.bigdata.analysis.variant.VariantAnalysisExecutor;
import org.opencb.hpg.bigdata.core.lib.SparkConfCreator;
import org.opencb.hpg.bigdata.core.lib.VariantDataset;
import org.slf4j.Logger;
@@ -24,19 +25,20 @@
import java.util.HashMap;
import java.util.Map;
-public class PlinkAdaptor {
+public class PlinkWrapper extends VariantAnalysisWrapper {
private Query query;
private QueryOptions queryOptions;
private Logger logger;
- public PlinkAdaptor() {
- this.logger = LoggerFactory.getLogger(PlinkAdaptor.class);
+ public PlinkWrapper(String studyId) {
+ super(studyId);
+ this.logger = LoggerFactory.getLogger(PlinkWrapper.class);
}
- public void run() {
+ public void execute() {
String inputAvroFilename = query.getString("input");
String tmpVcfFilename = query.get("outdir") + "/tmp.vcf";
@@ -83,7 +85,7 @@ public void run() {
//params.put("output", "/tmp/test.bam.bai");
String commandLine = toolManager.createCommandLine("plink", "index", params);
- System.out.println(commandLine); Path tmp = Paths.get("/tmp");
+ System.out.println(commandLine);
+ Path tmp = Paths.get("/tmp");
Executor.execute(commandLine, tmp, true);
diff --git a/hpg-bigdata-analysis/src/main/java/org/opencb/hpg/bigdata/analysis/variant/wrappers/RvTestsWrapper.java b/hpg-bigdata-analysis/src/main/java/org/opencb/hpg/bigdata/analysis/variant/wrappers/RvTestsWrapper.java
new file mode 100644
index 00000000..f61c728f
--- /dev/null
+++ b/hpg-bigdata-analysis/src/main/java/org/opencb/hpg/bigdata/analysis/variant/wrappers/RvTestsWrapper.java
@@ -0,0 +1,95 @@
+package org.opencb.hpg.bigdata.analysis.variant.wrappers;
+
+import org.opencb.hpg.bigdata.analysis.AnalysisExecutorException;
+import org.opencb.hpg.bigdata.analysis.exceptions.AnalysisToolException;
+import org.opencb.hpg.bigdata.analysis.tools.Executor;
+import org.opencb.hpg.bigdata.analysis.variant.FilterParameters;
+import org.opencb.hpg.bigdata.analysis.variant.VariantAnalysisUtils;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.IOException;
+import java.nio.file.Path;
+import java.nio.file.Paths;
+import java.util.Map;
+
+/**
+ * Created by joaquin on 1/19/17.
+ */
+public class RvTestsWrapper extends VariantAnalysisWrapper {
+ private String inFilename;
+ private String metaFilename;
+ private FilterParameters filterParams;
+ private Map<String, String> rvtestsParams;
+
+ private Logger logger;
+
+ public RvTestsWrapper(String studyId, String inFilename, String metaFilename,
+ FilterParameters filterParams, Map<String, String> rvtestsParams) {
+ super(studyId);
+ this.inFilename = inFilename;
+ this.metaFilename = metaFilename;
+ this.filterParams = filterParams;
+ this.rvtestsParams = rvtestsParams;
+
+ this.logger = LoggerFactory.getLogger(RvTestsWrapper.class);
+ }
+
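+ /**
+ * Exports the filtered variants to VCF and the pedigree to PED, then launches
+ * the rvtests binary. A minimal usage sketch (paths and parameters are
+ * illustrative only):
+ * <pre>
+ * Map<String, String> params = new HashMap<>();
+ * params.put("single", "wald");
+ * params.put("out", "/tmp/out.wald");
+ * RvTestsWrapper rvtests = new RvTestsWrapper("study1", "input.vcf.avro",
+ * "input.vcf.avro.meta.json", new FilterParameters(), params);
+ * rvtests.setBinPath(Paths.get("/opt/rvtests/executable/rvtest"));
+ * rvtests.execute();
+ * </pre>
+ */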
+ @Override
+ public void execute() throws AnalysisExecutorException {
+ // Sanity check
+ if (binPath == null || !binPath.toFile().exists()) {
+ String msg = "RvTests binary path is missing.";
+ logger.error(msg);
+ throw new AnalysisExecutorException(msg);
+ }
+
+ // Get output dir
+ Path outDir = Paths.get("/tmp");
+ if (rvtestsParams.get("out") != null) {
+ outDir = Paths.get(rvtestsParams.get("out")).getParent();
+ }
+
+ // Export target variants to VCF file
+ String vcfFilename = outDir.toString() + "/tmp.vcf";
+ VariantAnalysisUtils.exportVCF(inFilename, metaFilename, filterParams, vcfFilename);
+
+ // Export pedigree
+ String pedFilename = outDir.toString() + "/tmp.vcf.ped";
+ try {
+ VariantAnalysisUtils.exportPedigree(metaFilename, studyId, pedFilename);
+ } catch (IOException e) {
+ logger.error(e.getMessage());
+ throw new AnalysisExecutorException(e);
+ }
+
+ StringBuilder sb = new StringBuilder();
+/*
+ // Compress vcf to bgz
+ sb.setLength(0);
+ sb.append(BGZIP_BIN).append(" ").append(vcfFilename);
+ Executor.execute(sb.toString(), outDir, true);
+
+ // ...create tabix index
+ sb.setLength(0);
+ sb.append(TABIX_BIN).append(" -p vcf ").append(vcfFilename).append(".gz");
+ Executor.execute(sb.toString(), outDir, true);
+*/
+ // ...and finally, run rvtests
+ sb.setLength(0);
+ sb.append(binPath);
+ sb.append(" --inVcf ").append(vcfFilename).append(".gz");
+ sb.append(" --pheno ").append(pedFilename);
+ for (String key: rvtestsParams.keySet()) {
+ sb.append(" --").append(key).append(" ").append(rvtestsParams.get(key));
+ }
+ try {
+ Executor.execute(sb.toString(), outDir, true);
+ } catch (AnalysisToolException e) {
+ logger.error(e.getMessage());
+ throw new AnalysisExecutorException(e);
+ }
+ }
+}
diff --git a/hpg-bigdata-analysis/src/main/java/org/opencb/hpg/bigdata/analysis/variant/wrappers/VariantAnalysisWrapper.java b/hpg-bigdata-analysis/src/main/java/org/opencb/hpg/bigdata/analysis/variant/wrappers/VariantAnalysisWrapper.java
new file mode 100644
index 00000000..c1d1228b
--- /dev/null
+++ b/hpg-bigdata-analysis/src/main/java/org/opencb/hpg/bigdata/analysis/variant/wrappers/VariantAnalysisWrapper.java
@@ -0,0 +1,22 @@
+package org.opencb.hpg.bigdata.analysis.variant.wrappers;
+
+import org.opencb.hpg.bigdata.analysis.variant.VariantAnalysisExecutor;
+
+import java.nio.file.Path;
+
+public abstract class VariantAnalysisWrapper extends VariantAnalysisExecutor {
+
+ protected Path binPath;
+
+ protected VariantAnalysisWrapper(String studyId) {
+ super(studyId);
+ }
+
+ public void setBinPath(Path binPath) {
+ this.binPath = binPath;
+ }
+
+ public Path getBinPath() {
+ return binPath;
+ }
+}
diff --git a/hpg-bigdata-app/src/main/java/org/opencb/hpg/bigdata/app/cli/local/CliUtils.java b/hpg-bigdata-app/src/main/java/org/opencb/hpg/bigdata/app/cli/local/CliUtils.java
index 37d2c702..75e06a02 100644
--- a/hpg-bigdata-app/src/main/java/org/opencb/hpg/bigdata/app/cli/local/CliUtils.java
+++ b/hpg-bigdata-app/src/main/java/org/opencb/hpg/bigdata/app/cli/local/CliUtils.java
@@ -43,6 +43,7 @@ public static String getOutputFilename(String input, String output, String to) t
return res;
}
+ @Deprecated
public static List<Region> getRegionList(String regions, String regionFilename) throws IOException {
List<Region> list = null;
if (StringUtils.isNotEmpty(regions)) {
@@ -97,6 +98,7 @@ public static void saveDatasetAsOneFile(ParentDataset ds, String format, String
dir.delete();
}
+ @Deprecated
public static void addVariantFilters(VariantCommandOptions variantCommandOptions,
VariantDataset vd) throws IOException {
@@ -253,6 +255,7 @@ public static void addVariantFilters(VariantCommandOptions variantCommandOptions
variantCommandOptions.queryVariantCommandOptions.pmaf, vd);
}
+ @Deprecated
private static void annotationFilterNotEmpty(String key, String value, VariantDataset vd) {
if (StringUtils.isNotEmpty(value)) {
vd.annotationFilter(key, value);
@@ -282,6 +285,7 @@ public static VariantCommandOptions createVariantCommandOptions(
* @param samples Sample list in the right order (to get the sample index)
* @return Updated sample genotype query string
*/
+ @Deprecated
private static String updateSampleGenotype(String sampleGenotype, List<Sample> samples) {
// e.g.: value = sample2:1|0,1|1
StringBuilder newSampleGenotype = new StringBuilder("");
diff --git a/hpg-bigdata-app/src/main/java/org/opencb/hpg/bigdata/app/cli/local/LocalCliOptionsParser.java b/hpg-bigdata-app/src/main/java/org/opencb/hpg/bigdata/app/cli/local/LocalCliOptionsParser.java
index fe207e79..c31221c5 100644
--- a/hpg-bigdata-app/src/main/java/org/opencb/hpg/bigdata/app/cli/local/LocalCliOptionsParser.java
+++ b/hpg-bigdata-app/src/main/java/org/opencb/hpg/bigdata/app/cli/local/LocalCliOptionsParser.java
@@ -20,6 +20,7 @@
import com.beust.jcommander.Parameter;
import com.beust.jcommander.ParameterDescription;
import com.beust.jcommander.ParameterException;
+import org.opencb.hpg.bigdata.analysis.variant.FilterParameters;
import org.opencb.hpg.bigdata.app.cli.local.options.*;
import java.util.Map;
@@ -35,6 +36,7 @@ public class LocalCliOptionsParser {
private final CommandOptions commandOptions;
private final CommonCommandOptions commonCommandOptions;
+ private final FilterParameters filterOptions;
private AdminCommandOptions adminCommandOptions;
@@ -55,6 +57,7 @@ public LocalCliOptionsParser() {
commandOptions = new CommandOptions();
commonCommandOptions = new CommonCommandOptions();
+ filterOptions = new FilterParameters();
adminCommandOptions = new AdminCommandOptions(commonCommandOptions, jcommander);
jcommander.addCommand("admin", adminCommandOptions);
@@ -77,7 +80,7 @@ public LocalCliOptionsParser() {
alignmentSubCommands.addCommand("coverage", alignmentCommandOptions.coverageAlignmentCommandOptions);
alignmentSubCommands.addCommand("query", alignmentCommandOptions.queryAlignmentCommandOptions);
- variantCommandOptions = new VariantCommandOptions(commonCommandOptions, jcommander);
+ variantCommandOptions = new VariantCommandOptions(commonCommandOptions, filterOptions, jcommander);
jcommander.addCommand("variant", variantCommandOptions);
JCommander variantSubCommands = jcommander.getCommands().get("variant");
variantSubCommands.addCommand("convert", variantCommandOptions.convertVariantCommandOptions);
diff --git a/hpg-bigdata-app/src/main/java/org/opencb/hpg/bigdata/app/cli/local/executors/VariantCommandExecutor.java b/hpg-bigdata-app/src/main/java/org/opencb/hpg/bigdata/app/cli/local/executors/VariantCommandExecutor.java
index 3abe2138..940e3753 100644
--- a/hpg-bigdata-app/src/main/java/org/opencb/hpg/bigdata/app/cli/local/executors/VariantCommandExecutor.java
+++ b/hpg-bigdata-app/src/main/java/org/opencb/hpg/bigdata/app/cli/local/executors/VariantCommandExecutor.java
@@ -31,10 +31,10 @@
import org.opencb.biodata.models.metadata.SampleSetType;
import org.opencb.biodata.models.variant.avro.StudyEntry;
import org.opencb.biodata.models.variant.avro.VariantAvro;
-import org.opencb.biodata.models.variant.avro.VariantFileMetadata;
+import org.opencb.biodata.models.variant.metadata.VariantFileMetadata;
import org.opencb.biodata.tools.variant.metadata.VariantMetadataManager;
import org.opencb.commons.utils.FileUtils;
-import org.opencb.hpg.bigdata.analysis.variant.RvTestsAdaptor;
+import org.opencb.hpg.bigdata.analysis.variant.wrappers.RvTestsWrapper;
import org.opencb.hpg.bigdata.app.cli.CommandExecutor;
import org.opencb.hpg.bigdata.app.cli.local.CliUtils;
import org.opencb.hpg.bigdata.app.cli.local.options.VariantCommandOptions;
@@ -53,7 +53,7 @@
import static java.nio.file.Paths.get;
-//import org.opencb.hpg.bigdata.analysis.variant.analysis.RvTestsAdaptor;
+//import org.opencb.hpg.bigdata.analysis.variant.analysis.RvTestsWrapper;
/**
* Created by imedina on 25/06/15.
@@ -193,7 +193,7 @@ private void convert() throws IOException {
ObjectMapper mapper = new ObjectMapper();
//mapper.setSerializationInclusion(JsonInclude.Include.NON_NULL);
VariantFileMetadata data = mapper.readValue(metaFile, VariantFileMetadata.class);
- data.setFileName(outMetaFile.toString());
+ data.setId(outMetaFile.toString());
// write the metadata
PrintWriter writer = new PrintWriter(new FileOutputStream(outMetaFile));
@@ -371,6 +371,9 @@ public void metadata() throws Exception {
if (variantCommandOptions.metadataVariantCommandOptions.loadPedFilename != null) {
Pedigree pedigree = new PedigreeManager().parse(
get(variantCommandOptions.metadataVariantCommandOptions.loadPedFilename));
+
+ System.out.println(pedigree.toString());
+
metadataManager.loadPedigree(pedigree, datasetId);
updated = true;
}
@@ -418,12 +421,14 @@ public void metadata() throws Exception {
public void rvtests() throws Exception {
- RvTestsAdaptor rvtests = new RvTestsAdaptor(variantCommandOptions.rvtestsVariantCommandOptions.inFilename,
- variantCommandOptions.rvtestsVariantCommandOptions.metaFilename,
- variantCommandOptions.rvtestsVariantCommandOptions.outDirname,
- variantCommandOptions.rvtestsVariantCommandOptions.confFilename);
-
-// rvtests.run(variantCommandOptions.rvtestsVariantCommandOptions.datasetId);
- rvtests.run00(variantCommandOptions.rvtestsVariantCommandOptions.datasetId);
+ RvTestsWrapper rvtests = new RvTestsWrapper(variantCommandOptions.rvtestsVariantCommandOptions.datasetId,
+ variantCommandOptions.rvtestsVariantCommandOptions.inFilename,
+ variantCommandOptions.rvtestsVariantCommandOptions.inFilename + ".meta.json",
+ variantCommandOptions.rvtestsVariantCommandOptions.filterParameters,
+ variantCommandOptions.rvtestsVariantCommandOptions.rvtestsParams);
+
+ // TODO: get the binary path from config folder
+ rvtests.setBinPath(Paths.get("/home/jtarraga/soft/rvtests/executable/rvtest"));
+ rvtests.execute();
}
}
diff --git a/hpg-bigdata-app/src/main/java/org/opencb/hpg/bigdata/app/cli/local/options/VariantCommandOptions.java b/hpg-bigdata-app/src/main/java/org/opencb/hpg/bigdata/app/cli/local/options/VariantCommandOptions.java
index 67a5b160..1b299923 100644
--- a/hpg-bigdata-app/src/main/java/org/opencb/hpg/bigdata/app/cli/local/options/VariantCommandOptions.java
+++ b/hpg-bigdata-app/src/main/java/org/opencb/hpg/bigdata/app/cli/local/options/VariantCommandOptions.java
@@ -1,12 +1,13 @@
package org.opencb.hpg.bigdata.app.cli.local.options;
-import com.beust.jcommander.JCommander;
-import com.beust.jcommander.Parameter;
-import com.beust.jcommander.Parameters;
-import com.beust.jcommander.ParametersDelegate;
+import com.beust.jcommander.*;
import org.apache.parquet.hadoop.ParquetWriter;
+import org.opencb.hpg.bigdata.analysis.variant.FilterParameters;
import org.opencb.hpg.bigdata.app.cli.local.LocalCliOptionsParser;
+import java.util.HashMap;
+import java.util.Map;
+
/**
* Created by jtarraga on 01/06/17.
*/
@@ -23,6 +24,7 @@ public class VariantCommandOptions {
public RvTestsVariantCommandOptions rvtestsVariantCommandOptions;
public LocalCliOptionsParser.CommonCommandOptions commonCommandOptions;
+ public FilterParameters commonFilterOptions;
public JCommander jCommander;
public VariantCommandOptions(LocalCliOptionsParser.CommonCommandOptions commonCommandOptions,
@@ -38,6 +40,21 @@ public VariantCommandOptions(LocalCliOptionsParser.CommonCommandOptions commonCo
this.rvtestsVariantCommandOptions = new RvTestsVariantCommandOptions();
}
+ public VariantCommandOptions(LocalCliOptionsParser.CommonCommandOptions commonCommandOptions,
+ FilterParameters commonFilterOptions,
+ JCommander jCommander) {
+ this.commonCommandOptions = commonCommandOptions;
+ this.commonFilterOptions = commonFilterOptions;
+ this.jCommander = jCommander;
+
+ this.convertVariantCommandOptions = new ConvertVariantCommandOptions();
+ this.annotateVariantCommandOptions = new AnnotateVariantCommandOptions();
+ this.viewVariantCommandOptions = new ViewVariantCommandOptions();
+ this.queryVariantCommandOptions = new QueryVariantCommandOptions();
+ this.metadataVariantCommandOptions = new MetadataVariantCommandOptions();
+ this.rvtestsVariantCommandOptions = new RvTestsVariantCommandOptions();
+ }
+
@Parameters(commandNames = {"convert"}, commandDescription = "Convert gVCF/VCF files to different big data"
+ " formats such as Avro and Parquet using OpenCB models")
public class ConvertVariantCommandOptions {
@@ -331,10 +348,16 @@ public class RvTestsVariantCommandOptions {
@ParametersDelegate
public LocalCliOptionsParser.CommonCommandOptions commonOptions = commonCommandOptions;
+ @ParametersDelegate
+ public FilterParameters filterParameters = commonFilterOptions;
+
+ @Parameter(names = {"--dataset"}, description = "Target dataset.", arity = 1)
+ public String datasetId = null;
+
@Parameter(names = {"-i", "--input"}, description = "Input file name (in Avro/Parquet file format).",
required = true, arity = 1)
public String inFilename;
-
+/*
@Parameter(names = {"-m", "--metadata"}, description = "Input metadata file name.",
required = true, arity = 1)
public String metaFilename;
@@ -343,12 +366,8 @@ public class RvTestsVariantCommandOptions {
required = true, arity = 1)
public String outDirname;
- @Parameter(names = {"--dataset"}, description = "Target dataset.",
- arity = 1)
- public String datasetId = null;
-
- @Parameter(names = {"-c", "--config"}, description = "Configuration file name containing the rvtests parameters.",
- required = true, arity = 1)
- public String confFilename;
+*/
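+ // Arbitrary rvtests arguments are forwarded as -D key=value pairs, e.g.
+ // "-Dsingle=wald -Dout=/tmp/out.wald" becomes "--single wald --out /tmp/out.wald"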
+ @DynamicParameter(names = "-D", description = "RvTests parameters")
+ public Map<String, String> rvtestsParams = new HashMap<>();
}
}
diff --git a/hpg-bigdata-app/src/test/java/org/opencb/hpg/bigdata/app/cli/local/VariantRvTestsCLITest.java b/hpg-bigdata-app/src/test/java/org/opencb/hpg/bigdata/app/cli/local/VariantRvTestsCLITest.java
index 2a3e64b0..e25cd198 100644
--- a/hpg-bigdata-app/src/test/java/org/opencb/hpg/bigdata/app/cli/local/VariantRvTestsCLITest.java
+++ b/hpg-bigdata-app/src/test/java/org/opencb/hpg/bigdata/app/cli/local/VariantRvTestsCLITest.java
@@ -2,7 +2,6 @@
import org.junit.Test;
-import java.net.URISyntaxException;
import java.nio.file.Path;
import java.nio.file.Paths;
@@ -10,36 +9,60 @@
* Created by joaquin on 1/19/17.
*/
public class VariantRvTestsCLITest {
- Path inPath;
- Path outPath;
- Path confPath;
-
- private void init() throws URISyntaxException {
- String root = "/home/jtarraga/data100/vcf/rvtest-skat/spark";
-// String root = "/home/jtarraga/data/vcf/skat/";
- inPath = Paths.get(root + "/example.vcf.avro");
- outPath = Paths.get(root + "/out");
- confPath = Paths.get(root + "/skat.params");
+ String datasetName = "test";
+
+ String vcfFilename = "../hpg-bigdata-app/src/test/resources/example.vcf";
+ String phenoFilename = "../hpg-bigdata-app/src/test/resources/pheno";
+ String outDir = "/tmp";
+
+ Path vcfPath;
+ Path phenoPath;
+ Path avroPath;
+ Path metaPath;
+
+ private void init() throws Exception {
+ vcfPath = Paths.get(vcfFilename);
+ phenoPath = Paths.get(phenoFilename);
+ avroPath = Paths.get(outDir + "/" + vcfPath.getFileName() + ".avro");
+ metaPath = Paths.get(outDir + "/" + vcfPath.getFileName() + ".avro.meta.json");
+
+ avroPath.toFile().delete();
+ metaPath.toFile().delete();
+
+ // convert vcf to avro
+ StringBuilder commandLine = new StringBuilder();
+ commandLine.append(" variant convert");
+ commandLine.append(" --log-level ERROR");
+ commandLine.append(" -i ").append(vcfPath);
+ commandLine.append(" -o ").append(avroPath);
+ commandLine.append(" --dataset ").append(datasetName);
+ VariantQueryCLITest.execute(commandLine.toString());
+
+ // load pedigree file
+ commandLine.setLength(0);
+ commandLine.append(" variant metadata");
+ commandLine.append(" --log-level ERROR");
+ commandLine.append(" -i ").append(avroPath);
+ commandLine.append(" --load-pedigree ").append(phenoPath);
+ commandLine.append(" --dataset ").append(datasetName);
+ VariantQueryCLITest.execute(commandLine.toString());
}
@Test
public void skat() {
-
try {
init();
StringBuilder commandLine = new StringBuilder();
commandLine.append(" variant rvtests");
commandLine.append(" --log-level ERROR");
- commandLine.append(" -i ").append(inPath);
- commandLine.append(" -o ").append(outPath);
- commandLine.append(" -c ").append(confPath);
- commandLine.append(" --dataset noname");
-
+ commandLine.append(" --dataset ").append(datasetName);
+ commandLine.append(" -i ").append(avroPath);
+ commandLine.append(" -Dsingle=wald");
+ commandLine.append(" -Dout=/tmp/out.wald");
VariantQueryCLITest.execute(commandLine.toString());
} catch (Exception e) {
e.printStackTrace();
}
}
-
}
diff --git a/hpg-bigdata-app/src/test/resources/example.vcf b/hpg-bigdata-app/src/test/resources/example.vcf
new file mode 100755
index 00000000..a2358409
--- /dev/null
+++ b/hpg-bigdata-app/src/test/resources/example.vcf
@@ -0,0 +1,9 @@
+##fileformat=VCFv4.1
+##FILTER=<ID=PASS,Description="All filters passed">
+##fileDate=20140730
+##reference=ftp://ftp.1000genomes.ebi.ac.uk//vol1/ftp/technical/reference/phase2_reference_assembly_sequence/hs37d5.fa.gz
+##source=1000GenomesPhase3Pipeline
+#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT P1 P2 P3 P4 P5 P6 P7 P8 P9
+1 1 . A G 100 . . GT 0/1 0/0 0/0 1/1 0/1 0/0 0/1 0/1 0/0
+1 2 . A G 100 . . GT 0/0 0/0 0/0 0/0 0/0 0/0 0/1 0/0 0/1
+1 3 . A G 100 . . GT 0/0 0/0 0/0 0/0 0/1 0/0 0/0 0/0 0/0
diff --git a/hpg-bigdata-app/src/test/resources/pheno b/hpg-bigdata-app/src/test/resources/pheno
new file mode 100755
index 00000000..608afd9b
--- /dev/null
+++ b/hpg-bigdata-app/src/test/resources/pheno
@@ -0,0 +1,10 @@
+#fid iid fatid matid sex phenotype y1 y2 y3 y4
+P1 P1 0 0 0 2 1.911 -1.465 -0.817 1
+P2 P2 0 0 2 2 2.146 -2.451 -0.178 2
+P3 P3 0 0 2 1 1.086 -1.194 -0.899 1
+P4 P4 0 0 2 1 0.704 -1.052 -0.237 1
+P5 P5 0 0 1 2 2.512 -3.085 -2.579 1
+P6 P6 0 0 2 2 1.283 -1.294 -0.342 1
+P7 P7 0 0 1 2 2.384 1.070 -1.522 2
+P8 P8 0 0 1 1 3.004 0.680 -1.875 1
+P9 P9 0 0 1 1 0.714 -0.116 -1.604 1