Skip to content

Commit

Permalink
analysis: major improvements in rvtests and plink analsys #125 #128
Browse files Browse the repository at this point in the history
  • Loading branch information
jtarraga committed Sep 13, 2017
1 parent 2b89706 commit fb84932
Show file tree
Hide file tree
Showing 19 changed files with 661 additions and 545 deletions.
5 changes: 5 additions & 0 deletions hpg-bigdata-analysis/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -84,6 +84,11 @@
<groupId>junit</groupId>
<artifactId>junit</artifactId>
</dependency>
<dependency>
<groupId>com.beust</groupId>
<artifactId>jcommander</artifactId>
<version>1.58</version>
</dependency>
</dependencies>

</project>
Original file line number Diff line number Diff line change
Expand Up @@ -4,15 +4,19 @@
* Created by jtarraga on 30/01/17.
*/
public abstract class AnalysisExecutor {
protected String datasetName;
protected String studyId;

public String getDatasetName() {
return datasetName;
protected AnalysisExecutor(String studyId) {
this.studyId = studyId;
}

public void setDatasetName(String datasetName) {
this.datasetName = datasetName;
protected String studyId() {
return studyId;
}

public abstract void execute() throws AnalysisExecutorException;
protected void setStudyId(String studyId) {
this.studyId = studyId;
}

protected abstract void execute() throws AnalysisExecutorException;
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,105 @@
package org.opencb.hpg.bigdata.analysis.variant;

import com.beust.jcommander.Parameter;

public class FilterParameters {
@Parameter(names = {"--id"}, description = "Query for ID; comma separated list of IDs, e.g.:"
+ " \"rs312411,rs421225\"", arity = 1)
public String ids;

@Parameter(names = {"--id-file"}, description = "Query for ID that are stored in a file, one ID per line,"
+ " e.g.: rs312411", arity = 1)
public String idFilename;

@Parameter(names = {"--type"}, description = "Query for type; comma separated list of IDs, e.g.:"
+ " \"INDEL,SNP,SNV\"", arity = 1)
public String types;

@Parameter(names = {"--s", "--study"}, description = "Query for study; comma separated list of study names",
arity = 1)
public String studies;

@Parameter(names = {"--biotype"}, description = "Query for biotype; comma separated list of biotype names,"
+ " e.g.: protein_coding, pseudogene", arity = 1)
public String biotypes;

@Parameter(names = {"-r", "--region"}, description = "Query for region; comma separated list of regions,"
+ " e.g.: 1:300000-400000000,15:343453463-8787665654", arity = 1)
public String regions;

@Parameter(names = {"--region-file"}, description = "Query for regions that are stored in a file, one region"
+ " per line, e.g.: 1:6700000-560000000", arity = 1)
public String regionFilename;

@Parameter(names = {"--maf"}, description = "Query for the Minor Allele Frequency of a given study and"
+ " cohort. Use the following format enclosed with double quotes: \"study_name::cohort_name"
+ "[<|>|<=|>=|==|!=]value\", e.g.: \"1000g::all>0.4\"", arity = 1)
public String maf;

@Parameter(names = {"--mgf"}, description = "Query for the Minor Genotype Frequency of a given study and"
+ " cohort. Use the following format enclosed with double quotes: \"study_name::cohort_name"
+ "[<|>|<=|>=|==|!=]value\", e.g.: \"1000g::all>0.18198\"", arity = 1)
public String mgf;

@Parameter(names = {"--missing-allele"}, description = "Query for the number of missing alleles of a given"
+ " study and cohort. Use the following format enclosed with double quotes: \"study_name::cohort_name"
+ "[<|>|<=|>=|==|!=]value\", e.g.: \"1000g::all==5\"", arity = 1)
public String missingAlleles;

@Parameter(names = {"--missing-genotype"}, description = "Query for the number of missing genotypes of a"
+ " given study and cohort. Use the following format enclosed with double quotes: \"study_name::"
+ "cohort_name[<|>|<=|>=|==|!=]value\", e.g.: \"1000g::all!=0\"", arity = 1)
public String missingGenotypes;

@Parameter(names = {"--ct", "--consequence-type"}, description = "Query for Sequence Ontology term names or"
+ " accession codes; comma separated (use double quotes if you provide term names), e.g.:"
+ " \"transgenic insertion,SO:32234,SO:00124\"", arity = 1)
public String consequenceTypes;

@Parameter(names = {"--gene"}, description = "Query for gene; comma separated list of gene names, e.g.:"
+ " \"BIN3,ZNF517\"", arity = 1)
public String genes;

@Parameter(names = {"--clinvar"}, description = "Query for clinvar (accession); comma separated list of"
+ " accessions", arity = 1)
public String clinvar;

@Parameter(names = {"--cosmic"}, description = "Query for cosmic (mutation ID); comma separated list of"
+ " mutations IDs", arity = 1)
public String cosmic;

// @Parameter(names = {"--gwas"}, description = "Query for gwas (traits); comma separated list of traits",
// arity = 1)
// public String gwas;

@Parameter(names = {"--conservation"}, description = "Query for conservation scores (phastCons, phylop, gerp);"
+ "comma separated list of scores and enclosed with double quotes, e.g.: \"phylop<0.3,phastCons<0.1\"",
arity = 1)
public String conservScores;

@Parameter(names = {"--ps", "--protein-substitution"}, description = "Query for protein substitution scores"
+ " (polyphen, sift); comma separated list of scores and enclosed with double quotes, e.g.:"
+ "\"polyphen>0.3,sift>0.6\"", arity = 1)
public String substScores;

@Parameter(names = {"--pf", "--population-frequency"}, description = "Query for alternate population"
+ " frequency of a given study. Use the following format enclosed with double quotes:"
+ " \"study_name::population_name[<|>|<=|>=|==|!=]frequency_value\", e.g.: \"1000g::CEU<0.4\"",
arity = 1)
public String pf;

@Parameter(names = {"--pmaf", "--population-maf"}, description = "Query for population minor allele frequency"
+ " of a given study. Use the following the format enclosed with double quotes: \"study_name::"
+ "population_name[<|>|<=|>=|==|!=]frequency_value\", e.g.: \"1000g::PJL<=0.25\"", arity = 1)
public String pmaf;

@Parameter(names = {"--sample-genotype"}, description = "Query for sample genotypes. Use the following the"
+ " format enclosed with double quotes: \"sample_name1:genotype1,genotype;sample_name2:genotype1\","
+ " e.g.: \"HG00112:0/0;HG23412:1/0,1/1\"", arity = 1)
public String sampleGenotypes;

@Parameter(names = {"--sample-filter"}, description = "Query for sample filter, i.e.: individual attributes (family, father,"
+ " mother, sex and phenotype) and user-defined attributes from pedigree information,"
+ " e.g.: \"individual.sex=MALE;Eyes=Blue\"", arity = 1)
public String sampleFilters;
}
Original file line number Diff line number Diff line change
Expand Up @@ -63,14 +63,13 @@ public void execute() {
System.out.println("r2: " + trainingSummary.r2());
}

public LinearRegressionAnalysis(String datasetName, String studyName, String depVarName, String indepVarName) {
this(datasetName, studyName, depVarName, indepVarName, 10, 0.3, 0.8);
public LinearRegressionAnalysis(String studyId, String depVarName, String indepVarName) {
this(studyId, depVarName, indepVarName, 10, 0.3, 0.8);
}

public LinearRegressionAnalysis(String datasetName, String studyName, String depVarName, String indepVarName,
public LinearRegressionAnalysis(String studyId, String depVarName, String indepVarName,
int numIterations, double regularization, double elasticNet) {
this.datasetName = datasetName;
this.studyName = studyName;
super(studyId);
this.depVarName = depVarName;
this.indepVarName = indepVarName;
this.numIterations = numIterations;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -70,14 +70,13 @@ public void execute() {
lrModel.setThreshold(bestThreshold);
}

public LogisticRegressionAnalysis(String datasetName, String studyName, String depVarName, String indepVarName) {
this(datasetName, studyName, depVarName, indepVarName, 10, 0.3, 0.8);
public LogisticRegressionAnalysis(String studyId, String depVarName, String indepVarName) {
this(studyId, depVarName, indepVarName, 10, 0.3, 0.8);
}

public LogisticRegressionAnalysis(String datasetName, String studyName, String depVarName, String indepVarName,
public LogisticRegressionAnalysis(String studyId, String depVarName, String indepVarName,
int numIterations, double regularization, double elasticNet) {
this.datasetName = datasetName;
this.studyName = studyName;
super(studyId);
this.depVarName = depVarName;
this.indepVarName = indepVarName;
this.numIterations = numIterations;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -29,13 +29,12 @@ public void execute() {
result.show(false);
}

public PCAAnalysis(String datasetName, String studyName, String featureName) {
this(datasetName, studyName, featureName, 3);
public PCAAnalysis(String studyId, String studyName, String featureName) {
this(studyId, studyName, featureName, 3);
}

public PCAAnalysis(String datasetName, String studyName, String featureName, int kValue) {
this.datasetName = datasetName;
this.studyName = studyName;
public PCAAnalysis(String studyId, String studyName, String featureName, int kValue) {
super(studyId);
this.featureName = featureName;
this.kValue = kValue;
}
Expand Down
Loading

0 comments on commit fb84932

Please sign in to comment.