Merge branch 'develop'. Version update to 1.0.1. Close #28

mikessh · Jun 30, 2015 · 89a9568 · 89a9568
2 parents 588316e + e83c2e5
commit 89a9568
Show file tree

Hide file tree

Showing 32 changed files with 1,080 additions and 27 deletions.
diff --git a/README.md b/README.md
@@ -6,7 +6,8 @@
 
 A comprehensive framework for post-analysis of immune repertoire sequencing data.
 Compiled binaries are available from [here](https://github.com/mikessh/vdjtools/releases/latest).
-The software is cross-platform and requires Java v1.7+ to run and R to generate publication-ready graphics.
+The software is cross-platform and requires Java v1.8 (it can still be compiled from source under 
+JRE 1.7 though) to run and R to generate publication-ready graphics.
 
 List of features and detailed documentation can be found at [ReadTheDocs](http://vdjtools-doc.readthedocs.org/en/latest/).
 

diff --git a/pom.xml b/pom.xml
@@ -20,7 +20,7 @@
 
     <groupId>com.antigenomics</groupId>
     <artifactId>vdjtools</artifactId>
-    <version>1.0.0</version>
+    <version>1.0.1</version>
     <packaging>jar</packaging>
 
     <name>vdjtools</name>
@@ -35,9 +35,8 @@
             <resource>
                 <directory>src/main/resources</directory>
                 <includes>
-                    <include>segments/*</include>
                     <include>rscripts/*</include>
-                    <include>pwm/*</include>
+                    <include>profile/*</include>
                 </includes>
             </resource>
         </resources>

diff --git a/src/main/groovy/com/antigenomics/vdjtools/VdjTools.groovy b/src/main/groovy/com/antigenomics/vdjtools/VdjTools.groovy
@@ -26,6 +26,7 @@ import com.antigenomics.vdjtools.operate.JoinSamples
 import com.antigenomics.vdjtools.operate.PoolSamples
 import com.antigenomics.vdjtools.overlap.*
 import com.antigenomics.vdjtools.preprocess.*
+import com.antigenomics.vdjtools.profile.CalcCdrAAProfile
 import com.antigenomics.vdjtools.util.Convert
 import com.antigenomics.vdjtools.util.ExecUtil
 import com.antigenomics.vdjtools.util.RInstall
@@ -75,6 +76,7 @@ def printHelp = {
     println ""
     println "[Annotation]"
     println "ScanDatabase"
+    println "CalcCdrAAProfile"
     println ""
     println "[Util]"
     println "Convert"
@@ -134,6 +136,8 @@ def getScript = { String scriptName ->
 
         case "SCANDATABASE":
             return new ScanDatabase()
+        case "CALCCDRAAPROFILE":
+            return new CalcCdrAAProfile()
 
         case "CONVERT":
             return new Convert()
@@ -164,7 +168,7 @@ else {
     try {
         ExecUtil.run(script, args.length > 1 ? args[1..-1] : [""])
     } catch (Exception e) {
-        println "[ERROR] $e.message, see _vdjtools_error.log for details"
+        println "[ERROR] ${e.toString()}, see _vdjtools_error.log for details"
         new File("_vdjtools_error.log").withWriterAppend { writer ->
             writer.println("[${new Date()} BEGIN]")
             writer.println("[Script]")

diff --git a/src/main/groovy/com/antigenomics/vdjtools/io/parser/MiXcrParser.groovy b/src/main/groovy/com/antigenomics/vdjtools/io/parser/MiXcrParser.groovy
@@ -30,7 +30,8 @@ public class MiXcrParser extends ClonotypeStreamParser {
     boolean initialized = false;
     int countColumn, freqColumn, cdr3ntColumn, cdr3aaColumn,
         vHitsColumn, dHitsColumn, jHitsColumn,
-        vAlignmentsColumn, dAlignmentsColumn, jAlignmentsColumn
+        vAlignmentsColumn, dAlignmentsColumn, jAlignmentsColumn,
+        numberOfColumns
 
     /**
      * {@inheritDoc}
@@ -67,6 +68,8 @@ public class MiXcrParser extends ClonotypeStreamParser {
                 vAlignmentsColumn == -1 || dAlignmentsColumn == -1 || jAlignmentsColumn == -1)
             throw new RuntimeException("Some mandatory columns are absent in the input file.");
 
+        numberOfColumns = splitHeaderLine.size()
+
         // Initialized
         initialized = true;
     }
@@ -78,7 +81,7 @@ public class MiXcrParser extends ClonotypeStreamParser {
     protected Clonotype innerParse(String clonotypeString) {
         ensureInitialized()
 
-        def splitString = clonotypeString.split(software.delimiter)
+        def splitString = clonotypeString.split(software.delimiter, numberOfColumns)
 
         def count = splitString[countColumn].toInteger()
         def freq = splitString[freqColumn].toDouble()
@@ -87,18 +90,21 @@ public class MiXcrParser extends ClonotypeStreamParser {
 
         def cdr3aa = splitString[cdr3aaColumn] // no need to unify, MiXCR is based on milib
 
-
         String v, d, j
         (v, d, j) = extractVDJ(splitString[[vHitsColumn, dHitsColumn, jHitsColumn]])
 
         List<Alignment> vAlignemtns = parseAlignments(splitString[vAlignmentsColumn])
         List<Alignment> dAlignemtns = parseAlignments(splitString[dAlignmentsColumn])
         List<Alignment> jAlignemtns = parseAlignments(splitString[jAlignmentsColumn])
 
-        def segmPoints = [vAlignemtns[0].seq2End - 1,
-                          dAlignemtns.size() > 0 ? dAlignemtns[0].seq2Begin : -1,
-                          dAlignemtns.size() > 0 ? dAlignemtns[0].seq2End - 1 : -1,
-                          jAlignemtns[0].seq2Begin] as int[]
+        def segmPoints = [vAlignemtns.size() > 0 && vAlignemtns[0] != null ?
+                                  vAlignemtns[0].seq2End - 1 : 0,
+                          dAlignemtns.size() > 0 && dAlignemtns[0] != null ?
+                                  dAlignemtns[0].seq2Begin : -1,
+                          dAlignemtns.size() > 0 && dAlignemtns[0] != null ?
+                                  dAlignemtns[0].seq2End - 1 : -1,
+                          jAlignemtns.size() > 0 && jAlignemtns[0] != null ?
+                                  jAlignemtns[0].seq2Begin : cdr3nt.size() - 1] as int[]
 
         boolean inFrame = inFrame(cdr3aa),
                 noStop = noStop(cdr3aa),
@@ -113,12 +119,16 @@ public class MiXcrParser extends ClonotypeStreamParser {
         if (alignmentsLine.isEmpty())
             return Collections.EMPTY_LIST
 
-        String[] splitByAlignments = alignmentsLine.split(",")
+        String[] splitByAlignments = alignmentsLine.split(";", -1)
         List<Alignment> ret = new ArrayList<>()
         for (String alignmentString : splitByAlignments) {
-            String[] splitByFields = alignmentString.split("\\|")
-            ret.add(new Alignment(splitByFields[0].toInteger(), splitByFields[1].toInteger(),
-                    splitByFields[3].toInteger(), splitByFields[4].toInteger()));
+            if (alignmentString.trim().empty) {
+                ret.add(null)
+            } else {
+                String[] splitByFields = alignmentString.split("\\|")
+                ret.add(new Alignment(splitByFields[0].toInteger(), splitByFields[1].toInteger(),
+                        splitByFields[3].toInteger(), splitByFields[4].toInteger()));
+            }
         }
         return ret;
     }

diff --git a/src/main/groovy/com/antigenomics/vdjtools/profile/BasicAminoAcidProperties.groovy b/src/main/groovy/com/antigenomics/vdjtools/profile/BasicAminoAcidProperties.groovy
@@ -0,0 +1,44 @@
+/*
+ * Copyright 2013-2015 Mikhail Shugay (mikhail.shugay@gmail.com)
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.antigenomics.vdjtools.profile
+
+import com.antigenomics.vdjtools.util.CommonUtil
+
+class BasicAminoAcidProperties {
+    static final BasicAminoAcidProperties INSTANCE = new BasicAminoAcidProperties()
+
+    private final AminoAcidProperty[] aminoAcidProperties
+
+    private BasicAminoAcidProperties() {
+        aminoAcidProperties = AminoAcidProperty.fromInput(CommonUtil.resourceStream("profile/aa_property_table.txt"))
+    }
+
+    List<String> getPropertyNames() {
+        aminoAcidProperties.collect { it.name }
+    }
+
+    AminoAcidProperty[] getProperties(List<String> propertyNames) {
+        if (propertyNames.isEmpty())
+            return getProperties()
+
+        aminoAcidProperties.findAll { propertyNames.contains(it.name) } as AminoAcidProperty[]
+    }
+
+    AminoAcidProperty[] getProperties() {
+        Arrays.copyOf(aminoAcidProperties, aminoAcidProperties.length)
+    }
+}
diff --git a/src/main/groovy/com/antigenomics/vdjtools/profile/CalcCdrAAProfile.groovy b/src/main/groovy/com/antigenomics/vdjtools/profile/CalcCdrAAProfile.groovy
@@ -0,0 +1,152 @@
+/*
+ * Copyright 2013-2015 Mikhail Shugay (mikhail.shugay@gmail.com)
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.antigenomics.vdjtools.profile
+
+import com.antigenomics.vdjtools.sample.Sample
+import com.antigenomics.vdjtools.sample.SampleCollection
+import com.antigenomics.vdjtools.sample.metadata.MetadataTable
+import com.antigenomics.vdjtools.util.RUtil
+
+import static com.antigenomics.vdjtools.util.ExecUtil.formOutputPath
+import static com.antigenomics.vdjtools.util.ExecUtil.toPlotPath
+
+
+def DEFAULT_AA_PROPERTIES = BasicAminoAcidProperties.INSTANCE.propertyNames.join(","),
+    DEFAULT_BINNING = "V-germ:5,VJ-junc:3,J-germ:5"
+
+def cli = new CliBuilder(usage: "CalcCdrAAProfile [options] " +
+        "[sample1 sample2 sample3 ... if -m is not specified] output_prefix")
+cli.h("display help message")
+cli.m(longOpt: "metadata", argName: "filename", args: 1,
+        "Metadata file. First and second columns should contain file name and sample id. " +
+                "Header is mandatory and will be used to assign column names for metadata.")
+cli.u(longOpt: "unweighted", "Will count each clonotype only once, apart from conventional frequency-weighted histogram.")
+cli.o(longOpt: "property-list", argName: "group1,...", args: 1,
+        "Comma-separated list of amino-acid properties to analyze. " +
+                "Allowed values: $DEFAULT_AA_PROPERTIES. " +
+                "[default = use all]")
+cli.r(longOpt: "region-list", argName: "segment1:nbins1,...", args: 1,
+        "List of segments to analyze and corresponding bin counts. " +
+                "Allowed segments: ${KnownCdr3Regions.INSTANCE.regionNames.join(",")}. " +
+                "[default = $DEFAULT_BINNING]")
+cli.p(longOpt: "plot", "Plot amino acid property distributions for a specified list of segments.")
+cli.f(longOpt: "factor", argName: "string", args: 1, "Metadata entry used to group samples in plot.")
+cli._(longOpt: "plot-type", argName: "<pdf|png>", args: 1, "Plot output format [default=pdf]")
+cli._(longOpt: "include-cfw", "Consider first and last AAs of CDR3, which are normally conserved C and F/W")
+
+
+def opt = cli.parse(args)
+
+if (opt == null)
+    System.exit(-1)
+
+if (opt.h || opt.arguments().size() == 0) {
+    cli.usage()
+    System.exit(-1)
+}
+
+// Check if metadata is provided
+
+def metadataFileName = opt.m
+
+if (metadataFileName ? opt.arguments().size() != 1 : opt.arguments().size() < 2) {
+    if (metadataFileName)
+        println "Only output prefix should be provided in case of -m"
+    else
+        println "At least 1 sample files should be provided if not using -m"
+    cli.usage()
+    System.exit(-1)
+}
+
+// Remaining arguments
+
+def outputFilePrefix = opt.arguments()[-1],
+    unweighted = (boolean) opt.u,
+    binning = (opt.r ?: DEFAULT_BINNING).split(",").collectEntries {
+        def split2 = it.split(":")
+        [(KnownCdr3Regions.INSTANCE.getByName(split2[0])): split2[1].toInteger()]
+    },
+    properties = (opt.o ?: DEFAULT_AA_PROPERTIES).split(","),
+    plot = (boolean) opt.p,
+    plotType = (opt.'plot-type' ?: "pdf").toString(),
+    includeCFW = (boolean) opt.'include-cfw'
+
+def badProperties = properties.findAll { !BasicAminoAcidProperties.INSTANCE.propertyNames.contains(it) }
+
+if (badProperties.size() > 0) {
+    println "[ERROR] Unknown amino acid properties: ${badProperties.join(",")}. " +
+            "Allowed values are $DEFAULT_AA_PROPERTIES"
+    System.exit(-1)
+}
+
+def scriptName = getClass().canonicalName.split("\\.")[-1]
+
+//
+// Batch load all samples (lazy)
+//
+
+println "[${new Date()} $scriptName] Reading sample(s)"
+
+def sampleCollection = metadataFileName ?
+        new SampleCollection((String) metadataFileName) :
+        new SampleCollection(opt.arguments()[0..-2])
+
+println "[${new Date()} $scriptName] ${sampleCollection.size()} sample(s) prepared"
+
+//
+// Compute and output diversity measures, spectratype, etc
+//
+
+def profileBuilder = new Cdr3AAProfileBuilder(binning, !unweighted, !includeCFW, properties)
+
+def outputFileName = formOutputPath(outputFilePrefix, "cdr3aa", "profile", (unweighted ? "unwt" : "wt"))
+
+new File(outputFileName).withPrintWriter { pw ->
+    def header = "#$MetadataTable.SAMPLE_ID_COLUMN\t" +
+            sampleCollection.metadataTable.columnHeader + "\t" +
+            "cdr3.segment\tbin\tproperty\tvalue\ttotal"
+
+    pw.println(header)
+
+    def sampleCounter = 0
+
+    sampleCollection.each { Sample sample ->
+        def profiles = profileBuilder.create(sample)
+
+        println "[${new Date()} $scriptName] ${++sampleCounter} sample(s) processed"
+
+        profiles.each { profileEntry ->
+            def segmentName = profileEntry.key.name
+            profileEntry.value.bins.each { bin ->
+                bin.summary.each {
+                    pw.println([sample.sampleMetadata.sampleId, sample.sampleMetadata,
+                                segmentName, bin.index, it.key,
+                                it.value, bin.total].join("\t"))
+                }
+            }
+        }
+    }
+}
+if (plot) {
+    RUtil.execute("cdr3aa_profile.r",
+            outputFileName,
+            toPlotPath(outputFileName, plotType),
+            opt.f ? (sampleCollection.metadataTable.getColumnIndex(opt.f) + 2).toString() : "0",
+    )
+}
+
+println "[${new Date()} $scriptName] Finished"
diff --git a/src/main/groovy/com/antigenomics/vdjtools/profile/Cdr3AAProfileBuilder.groovy b/src/main/groovy/com/antigenomics/vdjtools/profile/Cdr3AAProfileBuilder.groovy
@@ -0,0 +1,56 @@
+/*
+ * Copyright 2013-2015 Mikhail Shugay (mikhail.shugay@gmail.com)
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.antigenomics.vdjtools.profile
+
+import com.antigenomics.vdjtools.sample.Clonotype
+
+class Cdr3AAProfileBuilder {
+    private final Map<Cdr3Region, Integer> binning
+    private final boolean weighted, excludeCysPhe
+    private final List<String> groups
+
+    Cdr3AAProfileBuilder(Map<Cdr3Region, Integer> binning, boolean weighted,
+                         boolean excludeCysPhe,
+                         String... groups) {
+        this.binning = binning
+        this.weighted = weighted
+        this.excludeCysPhe = excludeCysPhe
+        this.groups = groups
+    }
+
+    Map<Cdr3Region, AminoAcidProfile> create(Iterable<Clonotype> clonotypes) {
+        def profiles = new HashMap<Cdr3Region, AminoAcidProfile>()
+
+        binning.each {
+            profiles.put(it.key, new AminoAcidProfile(it.value,
+                    BasicAminoAcidProperties.INSTANCE.getProperties(groups)))
+        }
+
+        clonotypes.each { clonotype ->
+            if (clonotype.isCoding()) {
+                profiles.each {
+                    def aaSeq = it.key.extractAminoAcid(clonotype, excludeCysPhe)
+                    if (aaSeq.size() > 0) {
+                        it.value.update(aaSeq, weighted ? clonotype.freq : 1.0d)
+                    }
+                }
+            }
+        }
+
+        profiles
+    }
+}