Skip to content

Commit

Permalink
Remove workaround for gzip/BGZF compressed VCF headers
Browse files Browse the repository at this point in the history
  • Loading branch information
heuermh committed Jul 1, 2016
1 parent 8abb47a commit 964a42b
Show file tree
Hide file tree
Showing 4 changed files with 16 additions and 29 deletions.
33 changes: 7 additions & 26 deletions adam-core/src/main/scala/org/bdgenomics/adam/rdd/ADAMContext.scala
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,7 @@ import org.bdgenomics.utils.io.LocalFileByteAccess
import org.bdgenomics.utils.misc.HadoopUtil
import org.bdgenomics.utils.misc.Logging
import org.seqdoop.hadoop_bam._
import org.seqdoop.hadoop_bam.util.{ BGZFCodec, SAMHeaderReader, VCFHeaderReader, WrapSeekable }
import org.seqdoop.hadoop_bam.util.{ BGZFCodec, BGZFEnhancedGzipCodec, SAMHeaderReader, VCFHeaderReader, WrapSeekable }
import scala.collection.JavaConversions._
import scala.collection.Map
import scala.reflect.ClassTag
Expand Down Expand Up @@ -149,30 +149,9 @@ class ADAMContext(@transient val sc: SparkContext) extends Serializable with Log
(sd, samples)
}

try {
val vcfHeader = VCFHeaderReader.readHeaderFrom(WrapSeekable.openPath(sc.hadoopConfiguration,
new Path(filePath)))
headerToMetadata(vcfHeader)
} catch {
case e: Throwable => {

// due to a bug upstream in Hadoop-BAM, the VCFHeaderReader class errors when reading
// headers from .vcf.gz files
//
// to work around this, we read a record from the file using the input format, which correctly
// determines the VCF input type. calling first should lead to us only reading a single record.
log.warn("Caught exception (%s) when trying to load VCF metadata. Retrying via read as RDD.".format(e))
val vcfHeader = readVcfRecords(filePath)
.map(v => {
v._2
.get
.asInstanceOf[VariantContextWithHeader]
.getHeader
}).first

headerToMetadata(vcfHeader)
}
}
val vcfHeader = VCFHeaderReader.readHeaderFrom(WrapSeekable.openPath(sc.hadoopConfiguration,
new Path(filePath)))
headerToMetadata(vcfHeader)
}

private[rdd] def loadAvroSequences(filePath: String): SequenceDictionary = {
Expand Down Expand Up @@ -627,7 +606,9 @@ class ADAMContext(@transient val sc: SparkContext) extends Serializable with Log
private def readVcfRecords(filePath: String): RDD[(LongWritable, VariantContextWritable)] = {
// load vcf data
val job = HadoopUtil.newJob(sc)
job.getConfiguration().set("io.compression.codecs", classOf[BGZFCodec].getCanonicalName())
job.getConfiguration().setStrings("io.compression.codecs",
classOf[BGZFCodec].getCanonicalName(),
classOf[BGZFEnhancedGzipCodec].getCanonicalName())
sc.newAPIHadoopFile(
filePath,
classOf[VCFInputFormat], classOf[LongWritable], classOf[VariantContextWritable],
Expand Down
Binary file added adam-core/src/test/resources/test.vcf.bgz
Binary file not shown.
Original file line number Diff line number Diff line change
Expand Up @@ -278,12 +278,18 @@ class ADAMContextSuite extends ADAMFunSuite {
assert(vcs.count === 6)
}

sparkTest("can read a BGZF gzipped .vcf file") {
sparkTest("can read a BGZF gzipped .vcf file with .gz file extension") {
// per the test name, this fixture is BGZF-compressed but carries a plain .gz suffix
val bgzfGzPath = resourcePath("test.vcf.bgzf.gz")
val loaded = sc.loadVcf(bgzfGzPath, None)
// the loaded result is expected to hold exactly six elements
assert(loaded.count === 6)
}

sparkTest("can read a BGZF gzipped .vcf file with .bgz file extension") {
// load the .bgz-suffixed fixture in one step and check how many elements came back
val variantCount = sc.loadVcf(resourcePath("test.vcf.bgz"), None).count
assert(variantCount === 6)
}

ignore("can read an uncompressed BCFv2.2 file") { // see https://github.com/samtools/htsjdk/issues/507
val path = resourcePath("test.uncompressed.bcf")
val vcs = sc.loadVcf(path, None)
Expand Down
4 changes: 2 additions & 2 deletions pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -25,12 +25,12 @@
<parquet.version>1.8.1</parquet.version>
<!-- Edit the following line to configure the Hadoop (HDFS) version. -->
<hadoop.version>2.6.0</hadoop.version>
<hadoop-bam.version>7.5.0</hadoop-bam.version>
<hadoop-bam.version>7.6.0</hadoop-bam.version>
<scoverage.version>1.1.1</scoverage.version>
<slf4j.version>1.7.21</slf4j.version>
<bdg-formats.version>0.9.0</bdg-formats.version>
<bdg-utils.version>0.2.7</bdg-utils.version>
<htsjdk.version>2.3.0</htsjdk.version>
<htsjdk.version>2.5.0</htsjdk.version>
</properties>

<modules>
Expand Down

0 comments on commit 964a42b

Please sign in to comment.