-
Notifications
You must be signed in to change notification settings - Fork 309
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Modified CalculateDepth to calculate coverage from alignment files
- Loading branch information
1 parent
e7e1adf
commit 1624df6
Showing
8 changed files
with
322 additions
and
1 deletion.
There are no files selected for viewing
95 changes: 95 additions & 0 deletions
95
adam-cli/src/main/scala/org/bdgenomics/adam/cli/Reads2Coverage.scala
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,95 @@ | ||
/** | ||
* Licensed to Big Data Genomics (BDG) under one | ||
* or more contributor license agreements. See the NOTICE file | ||
* distributed with this work for additional information | ||
* regarding copyright ownership. The BDG licenses this file | ||
* to you under the Apache License, Version 2.0 (the | ||
* "License"); you may not use this file except in compliance | ||
* with the License. You may obtain a copy of the License at | ||
* | ||
* http://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, software | ||
* distributed under the License is distributed on an "AS IS" BASIS, | ||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
* See the License for the specific language governing permissions and | ||
* limitations under the License. | ||
*/ | ||
package org.bdgenomics.adam.cli | ||
|
||
import org.apache.spark.SparkContext | ||
import org.bdgenomics.adam.projections.AlignmentRecordField._ | ||
import org.bdgenomics.adam.projections.Projection | ||
import org.bdgenomics.adam.rdd.ADAMContext._ | ||
import org.bdgenomics.adam.rdd.features.CoverageRDD | ||
import org.bdgenomics.adam.rdd.read.{ AlignedReadRDD, AlignmentRecordRDD } | ||
import org.bdgenomics.utils.cli._ | ||
import org.kohsuke.args4j.{ Argument, Option => Args4jOption } | ||
|
||
/**
 * Companion for the Reads2Coverage command, exposed on the CLI as 'coverage'.
 *
 * The command takes two positional arguments, a reads file and an output
 * location, and calculates the number of reads at every location in the reads
 * file. The optional negativeStrands and positiveStrands flags restrict the
 * saved coverage to reads on the negative or positive strand, respectively.
 */
object Reads2Coverage extends BDGCommandCompanion {
  val commandName: String = "coverage"
  val commandDescription: String = "Calculate the depth from a given ADAM file"

  /**
   * Builds the command from raw command line arguments.
   *
   * @param cmdLine Command line arguments, parsed with Args4j into Reads2CoverageArgs.
   * @return A Reads2Coverage command configured with the parsed arguments.
   */
  def apply(cmdLine: Array[String]): BDGCommand = {
    val parsedArgs = Args4j[Reads2CoverageArgs](cmdLine)
    new Reads2Coverage(parsedArgs)
  }
}
|
||
/**
 * Command line arguments for the Reads2Coverage command: two required
 * positional paths (input reads, output coverage) and two optional strand flags.
 */
class Reads2CoverageArgs extends Args4jBase with ParquetArgs {

  // positional argument 0: where the reads are loaded from
  @Argument(
    index = 0,
    required = true,
    metaVar = "INPUT",
    usage = "The reads file to use to calculate depths")
  var inputPath: String = _

  // positional argument 1: where the coverage is written
  @Argument(
    index = 1,
    required = true,
    metaVar = "OUTPUT",
    usage = "Location to write the coverage data in ADAM/Parquet format")
  var outputPath: String = _

  // optional flag: save coverage computed from negative strand reads
  @Args4jOption(
    name = "-negativeStrands",
    required = false,
    usage = "Compute coverage for negative strands")
  var negativeStrands: Boolean = false

  // optional flag: save coverage computed from positive strand reads
  @Args4jOption(
    name = "-positiveStrands",
    required = false,
    usage = "Compute coverage for positive strands")
  var positiveStrands: Boolean = false
}
|
||
/**
 * Spark command that loads alignments from `args.inputPath`, converts them to
 * coverage and saves the result.
 *
 * If neither strand flag is set, a single coverage data set computed from all
 * reads is saved to `args.outputPath`. Otherwise one data set is saved per
 * requested strand, to `negative_<outputPath>` and/or `positive_<outputPath>`.
 *
 * @param args Parsed command line arguments for this command.
 */
class Reads2Coverage(protected val args: Reads2CoverageArgs) extends BDGSparkCommand[Reads2CoverageArgs] {
  // this command's own companion (previously pointed at CalculateDepth)
  val companion: BDGCommandCompanion = Reads2Coverage

  /**
   * Loads the reads, then saves unified or strand specific coverage.
   *
   * @param sc SparkContext used to load the alignments.
   */
  def run(sc: SparkContext): Unit = {

    // readNegativeStrand is projected as well because the strand filters below
    // read it; without it the field is not loaded for projected input.
    // NOTE(review): assumes AlignmentRecordField defines readNegativeStrand — confirm.
    val proj = Projection(contigName, start, end, cigar, readNegativeStrand)

    // load reads
    val readsRDD: AlignmentRecordRDD = sc.loadAlignments(args.inputPath, projection = Some(proj))

    // if strand direction is not specified, save unified coverage
    if (!args.negativeStrands && !args.positiveStrands) {

      val featureRDD: CoverageRDD = readsRDD.toCoverage
      featureRDD.save(args.outputPath)

    } else {
      // NOTE(review): the strand label is prepended to the whole output path
      // string, so for a path with a directory or URI scheme the label ends up
      // in front of it — confirm this is intended.

      // if negative strand, save coverage of only negative strands
      if (args.negativeStrands) {
        // keep only reads on the negative strand
        val negativeReadsRDD: AlignmentRecordRDD = AlignedReadRDD(
          readsRDD.rdd.filter(_.getReadNegativeStrand),
          readsRDD.sequences,
          readsRDD.recordGroups)

        val coverageRDD: CoverageRDD = negativeReadsRDD.toCoverage
        coverageRDD.save(s"negative_${args.outputPath}")
      }

      // if positive strand, save coverage of only positive strands
      if (args.positiveStrands) {
        // keep only reads on the positive strand
        val positiveReadsRDD: AlignmentRecordRDD = AlignedReadRDD(
          readsRDD.rdd.filter(!_.getReadNegativeStrand),
          readsRDD.sequences,
          readsRDD.recordGroups)

        // saved under its own "positive_" label so it no longer collides with
        // the negative strand output
        val coverageRDD: CoverageRDD = positiveReadsRDD.toCoverage
        coverageRDD.save(s"positive_${args.outputPath}")
      }
    }
  }
}
37 changes: 37 additions & 0 deletions
37
adam-cli/src/test/scala/org/bdgenomics/adam/cli/Reads2CoverageSuite.scala
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,37 @@ | ||
/** | ||
* Licensed to Big Data Genomics (BDG) under one | ||
* or more contributor license agreements. See the NOTICE file | ||
* distributed with this work for additional information | ||
* regarding copyright ownership. The BDG licenses this file | ||
* to you under the Apache License, Version 2.0 (the | ||
* "License"); you may not use this file except in compliance | ||
* with the License. You may obtain a copy of the License at | ||
* | ||
* http://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, software | ||
* distributed under the License is distributed on an "AS IS" BASIS, | ||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
* See the License for the specific language governing permissions and | ||
* limitations under the License. | ||
*/ | ||
package org.bdgenomics.adam.cli | ||
|
||
import org.bdgenomics.adam.rdd.ADAMContext._ | ||
import org.bdgenomics.adam.util.ADAMFunSuite | ||
import org.bdgenomics.utils.cli.Args4j | ||
|
||
/**
 * End-to-end test for the Reads2Coverage command: runs it on a small SAM
 * resource and checks the coverage count at a single position.
 */
class Reads2CoverageSuite extends ADAMFunSuite {

  sparkTest("correctly calculates coverage from small sam file") {
    // stage the input resource and choose where the command writes its output
    val samPath = copyResource("artificial.sam")
    val coveragePath = tmpFile("coverage.adam")

    // run the command exactly as the CLI would, with the two positional arguments
    val cmdLine: Array[String] = Array(samPath, coveragePath)
    val command = new Reads2Coverage(Args4j[Reads2CoverageArgs](cmdLine))
    command.run(sc)

    // reload what was written and inspect the coverage at position 30
    val savedCoverage = sc.loadCoverage(coveragePath)
    val atPosition30 = savedCoverage.rdd.filter(_.position == 30).first
    assert(atPosition30.count == 5)
  }
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
85 changes: 85 additions & 0 deletions
85
adam-core/src/main/scala/org/bdgenomics/adam/rdd/features/CoverageRDD.scala
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,85 @@ | ||
/** | ||
* Licensed to Big Data Genomics (BDG) under one | ||
* or more contributor license agreements. See the NOTICE file | ||
* distributed with this work for additional information | ||
* regarding copyright ownership. The BDG licenses this file | ||
* to you under the Apache License, Version 2.0 (the | ||
* "License"); you may not use this file except in compliance | ||
* with the License. You may obtain a copy of the License at | ||
* | ||
* http://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, software | ||
* distributed under the License is distributed on an "AS IS" BASIS, | ||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
* See the License for the specific language governing permissions and | ||
* limitations under the License. | ||
*/ | ||
package org.bdgenomics.adam.rdd.features | ||
|
||
import org.apache.spark.rdd.RDD | ||
import org.bdgenomics.adam.models.ReferenceRegion | ||
|
||
/**
 * Wrapper exposing a FeatureRDD as coverage. Each feature is read as a run of
 * positions [start, end) on a contig whose coverage count is the feature's
 * score attribute.
 *
 * @param featureRDD FeatureRDD holding records for coverage and corresponding
 * referenceRegion
 */
case class CoverageRDD(featureRDD: FeatureRDD) {

  /**
   * Java friendly save function. Delegates to the save function of the
   * underlying FeatureRDD, which selects the output format.
   *
   * NOTE(review): per the original documentation the format is chosen from the
   * file extension (".bed", ".narrow[pP]eak", ".interval_list", otherwise
   * Parquet) — that logic lives in FeatureRDD and is not visible here.
   *
   * @param filePath The location to write the output.
   */
  def save(filePath: java.lang.String) = {
    featureRDD.save(filePath)
  }

  /**
   * Gets per-position coverage overlapping the specified ReferenceRegion.
   *
   * Only positions inside [region.start, region.end) are returned. When
   * bpPerBin is greater than one, only positions that are a multiple of
   * bpPerBin are kept; counts are not aggregated across a bin.
   *
   * @param region ReferenceRegion to fetch overlapping coverage from
   * @param bpPerBin base pairs per bin; must be positive
   * @return RDD of Coverage records, one per retained position
   * @throws IllegalArgumentException if bpPerBin is not positive
   */
  def getCoverage(region: ReferenceRegion, bpPerBin: Int = 1): RDD[Coverage] = {
    // fail fast on the driver instead of with a modulo-by-zero inside a task
    require(bpPerBin > 0, s"bpPerBin must be positive, but was $bpPerBin")

    val regionStart: Long = region.start
    val regionEnd: Long = region.end

    featureRDD.filterByOverlappingRegion(region).rdd.flatMap(r => {
      val featureStart: Long = r.getStart
      val featureEnd: Long = r.getEnd

      // clamp to the requested region so positions from flanking parts of the
      // feature are never generated
      val first = math.max(featureStart, regionStart)
      val last = math.min(featureEnd, regionEnd)

      // lazily enumerate positions instead of materializing a List per feature
      Iterator.iterate(first)(_ + 1L)
        .takeWhile(_ < last)
        .filter(_ % bpPerBin == 0)
        .map(n => Coverage(r.getContigName, n, r.getScore))
    })
  }

  /**
   * Gets raw RDD of coverage.
   *
   * @return RDD[Coverage] with one record per position in [start, end) of each
   * feature in the underlying FeatureRDD
   */
  def rdd: RDD[Coverage] = {
    featureRDD.rdd.flatMap(r => {
      val featureStart: Long = r.getStart
      val featureEnd: Long = r.getEnd

      // lazily enumerate positions instead of materializing a List per feature
      Iterator.iterate(featureStart)(_ + 1L)
        .takeWhile(_ < featureEnd)
        .map(n => Coverage(r.getContigName, n, r.getScore))
    })
  }
}
|
||
/**
 * Coverage record for CoverageRDD: the coverage count at a single position.
 *
 * @param referenceName Name of the reference (chromosomal location) the position is on
 * @param position Position of the coverage
 * @param count Coverage count at the position
 */
case class Coverage(
  referenceName: String,
  position: Long,
  count: Double)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
53 changes: 53 additions & 0 deletions
53
adam-core/src/test/scala/org/bdgenomics/adam/rdd/features/CoverageRDDSuite.scala
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,53 @@ | ||
/** | ||
* Licensed to Big Data Genomics (BDG) under one | ||
* or more contributor license agreements. See the NOTICE file | ||
* distributed with this work for additional information | ||
* regarding copyright ownership. The BDG licenses this file | ||
* to you under the Apache License, Version 2.0 (the | ||
* "License"); you may not use this file except in compliance | ||
* with the License. You may obtain a copy of the License at | ||
* | ||
* http://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, software | ||
* distributed under the License is distributed on an "AS IS" BASIS, | ||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
* See the License for the specific language governing permissions and | ||
* limitations under the License. | ||
*/ | ||
package org.bdgenomics.adam.rdd.features | ||
|
||
import com.google.common.collect.ImmutableMap | ||
import java.io.File | ||
import org.apache.spark.rdd.RDD | ||
import org.bdgenomics.adam.models.{ ReferenceRegion, SequenceRecord, SequenceDictionary } | ||
import org.bdgenomics.adam.rdd.ADAMContext._ | ||
import org.bdgenomics.adam.util.ADAMFunSuite | ||
import org.bdgenomics.formats.avro.{ Feature, Strand } | ||
import org.scalactic.{ Equivalence, TypeCheckedTripleEquals } | ||
|
||
/**
 * Tests for CoverageRDD: checks that features are expanded into per-position
 * coverage restricted to a queried region.
 */
class CoverageRDDSuite extends ADAMFunSuite with TypeCheckedTripleEquals {

  /**
   * Creates a temporary file and returns the absolute path of a sibling in the
   * same directory whose name is the temporary file's name plus the suffix.
   *
   * @param suffix Suffix appended to the temporary file name.
   * @return Absolute path of the suffixed location.
   */
  def tempLocation(suffix: String = ".adam"): String = {
    val placeholder = File.createTempFile("FeatureRDDFunctionsSuite", "")
    new File(placeholder.getParentFile, placeholder.getName + suffix).getAbsolutePath
  }

  // sequence dictionary shared by the tests
  val sd = new SequenceDictionary(Vector(
    SequenceRecord("chr1", 2000L),
    SequenceRecord("chr2", 20000L)))

  sparkTest("correctly flatmaps coverage") {
    // builds a feature covering [start, end) on a contig with the given score
    def coverageFeature(contig: String, start: Long, end: Long, score: Double): Feature =
      Feature.newBuilder()
        .setContigName(contig)
        .setStart(start)
        .setEnd(end)
        .setScore(score)
        .build()

    val features = Seq(
      coverageFeature("chr1", 1L, 10L, 3.0),
      coverageFeature("chr1", 15L, 20L, 2.0),
      coverageFeature("chr2", 15L, 20L, 2.0))

    val coverageRDD: CoverageRDD = CoverageRDD(FeatureRDD(sc.parallelize(features), sd))

    // chr1:5-17 overlaps positions 5..9 of the first feature and 15..16 of the second
    val overlapping = coverageRDD.getCoverage(ReferenceRegion("chr1", 5, 17))
    assert(overlapping.count == 7)
  }

}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters