Merge ede036d into 5359a75

bigdatagenomics · Sep 12, 2019 · 40fae63 · 40fae63
2 parents 5359a75 + ede036d
commit 40fae63
Show file tree

Hide file tree

Showing 2 changed files with 76 additions and 0 deletions.
diff --git a/adam-core/src/main/scala/org/bdgenomics/adam/rdd/sequence/FASTAInFormatter.scala b/adam-core/src/main/scala/org/bdgenomics/adam/rdd/sequence/FASTAInFormatter.scala
@@ -0,0 +1,70 @@
+/**
+ * Licensed to Big Data Genomics (BDG) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The BDG licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.bdgenomics.adam.rdd.sequence
+
+import java.io.OutputStream
+import org.apache.hadoop.conf.Configuration
+import org.bdgenomics.adam.rdd.{ InFormatter, InFormatterCompanion }
+import org.bdgenomics.adam.sql.{ Sequence => SequenceProduct }
+import org.bdgenomics.formats.avro.Sequence
+
+/**
+ * Companion for FASTAInFormatter; acts as the factory that builds an
+ * InFormatter which writes FASTA.
+ */
+object FASTAInFormatter extends InFormatterCompanion[Sequence, SequenceProduct, SequenceDataset, FASTAInFormatter] {
+
+  /**
+   * Creates a FASTAInFormatter for the given dataset.
+   *
+   * @param gDataset GenomicDataset of Sequences. Its underlying RDD's context
+   *   supplies the Hadoop configuration handed to the formatter.
+   * @return Returns a new FASTA InFormatter.
+   */
+  def apply(gDataset: SequenceDataset): FASTAInFormatter = {
+    val hadoopConf = gDataset.rdd.context.hadoopConfiguration
+    new FASTAInFormatter(hadoopConf)
+  }
+}
+
+/**
+ * InFormatter that writes Sequences in FASTA format to the input stream of
+ * a process we are piping to.
+ *
+ * @param conf Hadoop configuration. Read once at construction to look up the
+ *   FASTA line width (SequenceDataset.FASTA_LINE_WIDTH, defaulting to 60).
+ */
+class FASTAInFormatter private (
+    conf: Configuration) extends InFormatter[Sequence, SequenceProduct, SequenceDataset, FASTAInFormatter] {
+
+  protected val companion = FASTAInFormatter
+
+  // Hadoop's Configuration exposes getInt(name, default); it has no getInteger.
+  private val lineWidth = conf.getInt(SequenceDataset.FASTA_LINE_WIDTH, 60)
+
+  // grouped() rejects non-positive sizes, so fail here with a message that
+  // names the offending configuration key instead of failing mid-write.
+  require(
+    lineWidth > 0,
+    s"${SequenceDataset.FASTA_LINE_WIDTH} must be positive, was $lineWidth")
+
+  /**
+   * Renders a single sequence as a FASTA record.
+   *
+   * The record is a ">" header line holding the name and, when present, a
+   * space followed by the description; then the sequence hard wrapped at
+   * lineWidth characters per line; then a trailing newline.
+   *
+   * @param sequence The record to render.
+   * @return The FASTA text for the record.
+   */
+  private def toFasta(sequence: Sequence): String = {
+    val sb = new StringBuilder(">")
+    // Avro getters may return null; skip absent fields rather than emitting
+    // the literal text "null" or throwing a NullPointerException.
+    Option(sequence.getName).foreach(name => sb.append(name))
+    Option(sequence.getDescription).foreach(description => sb.append(" ").append(description))
+    Option(sequence.getSequence).foreach(bases =>
+      bases.grouped(lineWidth).foreach(line => sb.append("\n").append(line)))
+    sb.append("\n")
+    sb.toString
+  }
+
+  /**
+   * Writes sequences to an output stream in FASTA format.
+   *
+   * @param os An OutputStream connected to a process we are piping to.
+   * @param iter An iterator of records to write.
+   */
+  def write(os: OutputStream, iter: Iterator[Sequence]): Unit = {
+    // Encode explicitly as UTF-8 so the bytes written do not depend on the
+    // JVM's platform default charset.
+    iter.foreach(sequence =>
+      os.write(toFasta(sequence).getBytes(java.nio.charset.StandardCharsets.UTF_8)))
+  }
+}
diff --git a/adam-core/src/main/scala/org/bdgenomics/adam/rdd/sequence/SequenceDataset.scala b/adam-core/src/main/scala/org/bdgenomics/adam/rdd/sequence/SequenceDataset.scala
@@ -74,6 +74,12 @@ private[adam] class SequenceArraySerializer extends IntervalArraySerializer[Refe
 
 object SequenceDataset {
 
+ /**
+ * Hadoop configuration key for the line width, in characters, at which
+ * FASTA formatted sequences are hard wrapped. FASTAInFormatter falls back
+ * to a width of 60 when this key is not set.
+ */
+ val FASTA_LINE_WIDTH = "org.bdgenomics.adam.rdd.sequence.SequenceDataset.lineWidth"
+
  /**
  * A genomic dataset that wraps a dataset of Sequence data.
  *