Skip to content

Commit

Permalink
Merge pull request #53 from pellucidanalytics/topic/csv
Browse files Browse the repository at this point in the history
Better CSV support
  • Loading branch information
tixxit committed Oct 29, 2014
2 parents 5982d83 + 6d86b79 commit efb5d77
Show file tree
Hide file tree
Showing 15 changed files with 947 additions and 420 deletions.
177 changes: 0 additions & 177 deletions framian/src/main/scala/framian/FrameUtils.scala

This file was deleted.

2 changes: 0 additions & 2 deletions framian/src/main/scala/framian/NumericColumnTyper.scala
Original file line number Diff line number Diff line change
Expand Up @@ -108,7 +108,6 @@ object NumericColumnTyper {
if (x.isExact) rational(x.toRational)
else bigFloat(x.toBigDecimal)
}
case (x: String) => string(x)
case _ => z
}
}
Expand All @@ -135,7 +134,6 @@ object NumericColumnTyper {
case Classes.BigDecimal => bigFloat(column.asInstanceOf[Column[BigDecimal]])
case Classes.JavaBigDecimal => bigFloat(column.asInstanceOf[Column[java.math.BigDecimal]] map (BigDecimal(_)))
case cls if Classes.Rational isAssignableFrom cls => rational(column.asInstanceOf[Column[Rational]])
case Classes.String => string(column.asInstanceOf[Column[String]])
case _ => z
}
}
Expand Down
13 changes: 7 additions & 6 deletions framian/src/main/scala/framian/UntypedColumn.scala
Original file line number Diff line number Diff line change
Expand Up @@ -38,17 +38,18 @@ import framian.column._
*/
trait UntypedColumn extends ColumnLike[UntypedColumn] {
def cast[A: ColumnTyper]: Column[A]
def orElse(that: UntypedColumn): UntypedColumn = MergedUntypedColumn(this, that)
def orElse(that: UntypedColumn): UntypedColumn = (this, that) match {
case (EmptyUntypedColumn, _) => that
case (_, EmptyUntypedColumn) => this
case _ => MergedUntypedColumn(this, that)
}
}

object UntypedColumn {
implicit object monoid extends Monoid[UntypedColumn] {
def id: UntypedColumn = empty
def op(lhs: UntypedColumn, rhs: UntypedColumn): UntypedColumn = (lhs, rhs) match {
case (EmptyUntypedColumn, _) => rhs
case (_, EmptyUntypedColumn) => lhs
case _ => MergedUntypedColumn(lhs, rhs)
}
def op(lhs: UntypedColumn, rhs: UntypedColumn): UntypedColumn =
lhs orElse rhs
}

final def empty: UntypedColumn = EmptyUntypedColumn
Expand Down
136 changes: 136 additions & 0 deletions framian/src/main/scala/framian/csv/Csv.scala
Original file line number Diff line number Diff line change
@@ -0,0 +1,136 @@
package framian
package csv

import spire.std.int._
import spire.std.string._

import framian.column.ColumnBuilder

/**
 * A parsed CSV document: the format it was read with (or will be rendered
 * with) and one entry per row, each either a parse error or a parsed row.
 *
 * The two concrete cases are [[LabeledCsv]], which carries a header, and
 * [[UnlabeledCsv]], which does not.
 */
sealed abstract class Csv {
  val format: CsvFormat
  val rows: Vector[Either[CsvError, CsvRow]]

  /** The successfully parsed rows, in their original order. */
  lazy val data: Vector[CsvRow] =
    rows.flatMap {
      case Right(row) => Some(row)
      case Left(_)    => None
    }

  /** The errors recorded for rows that failed to parse, in their original order. */
  lazy val errors: Vector[CsvError] =
    rows.flatMap {
      case Left(error) => Some(error)
      case Right(_)    => None
    }

  /** True when at least one row is an error. */
  def hasErrors: Boolean = errors.nonEmpty

  /**
   * This CSV without a header. An [[UnlabeledCsv]] is returned as-is; a
   * [[LabeledCsv]] drops its header and has `header = false` set on its format.
   */
  def unlabeled: UnlabeledCsv = this match {
    case already: UnlabeledCsv =>
      already
    case LabeledCsv(fmt, _, body) =>
      UnlabeledCsv(fmt.copy(header = false), body)
  }

  /**
   * This CSV with a header. A [[LabeledCsv]] is returned as-is; for an
   * [[UnlabeledCsv]] the first row is promoted to the header and the remaining
   * rows become the body.
   *
   * If there is no first row, or the first row is an error, the result has an
   * empty header and no rows at all.
   * NOTE(review): in the error case the remaining rows are discarded too --
   * confirm that this is intended.
   */
  def labeled: LabeledCsv = this match {
    case already: LabeledCsv =>
      already
    case UnlabeledCsv(fmt, body) =>
      val withHeader = fmt.copy(header = true)
      body.headOption match {
        case Some(Right(hdr)) =>
          // The header text is extracted with the original (header-less) format.
          LabeledCsv(withHeader, hdr.text(fmt), body.tail)
        case _ =>
          LabeledCsv(withHeader, Vector.empty, Vector.empty)
      }
  }

  /**
   * Renders the header (if any) followed by the successfully parsed rows,
   * joined with the format's row delimiter. Error rows are not rendered.
   */
  override def toString: String = {
    val lines: Vector[CsvRow] = this match {
      case LabeledCsv(_, header, _) =>
        val headerRow = CsvRow(header.map(CsvCell.Data(_)))
        headerRow +: data
      case UnlabeledCsv(_, _) =>
        data
    }

    val rendered = lines.iterator.map(line => line.render(format))
    rendered.mkString(format.rowDelim.value)
  }
}

/**
 * A CSV whose columns are named by `header`.
 *
 * @param format the CSV format associated with this document
 * @param header the column labels
 * @param rows   one entry per body row: a parse error or a parsed row
 */
case class LabeledCsv(format: CsvFormat, header: Vector[String], rows: Vector[Either[CsvError, CsvRow]]) extends Csv {
  /**
   * Builds a frame from the successfully parsed rows, then replaces its
   * column index with one built from `header`.
   */
  def toFrame: Frame[Int, String] = {
    val positional = Csv.toFrame(data)
    positional.withColIndex(Index.fromKeys(header: _*))
  }
}

/**
 * A CSV without a header; columns are identified by position.
 *
 * @param format the CSV format associated with this document
 * @param rows   one entry per row: a parse error or a parsed row
 */
case class UnlabeledCsv(format: CsvFormat, rows: Vector[Either[CsvError, CsvRow]]) extends Csv {
  /** Builds a frame from the successfully parsed rows, with columns keyed by position. */
  def toFrame: Frame[Int, Int] = {
    Csv.toFrame(data)
  }
}

/**
 * Constructors and conversions for [[Csv]]: parsing from readers, strings,
 * streams and files, and converting to and from frames.
 */
object Csv {
  /** 32 * 1024. Not referenced inside this object; presumably a read-buffer size used elsewhere -- TODO confirm. */
  val BufferSize = 32 * 1024

  /**
   * A CSV with no rows: labeled (with an empty header) when `format.header`
   * is set, unlabeled otherwise.
   */
  def empty(format: CsvFormat): Csv =
    if (format.header) LabeledCsv(format, Vector.empty, Vector.empty)
    else UnlabeledCsv(format, Vector.empty)

  /**
   * Builds a frame with one row per element of `rows` and one column per
   * cell position.
   *
   * Every column is built twice, once as `BigDecimal` and once as `String`,
   * and the two are combined with `orElse`. A data cell that does not parse
   * as a `BigDecimal` is NA in the numeric column. Empty cells are NA and
   * invalid cells are NM in both.
   *
   * Rows may have different widths. A position that is missing from a row
   * is recorded as NA for that row, so that the n-th entry of every column
   * always belongs to the n-th row.
   */
  private[csv] def toFrame(rows: Vector[CsvRow]): Frame[Int, Int] = {
    type Builders = (ColumnBuilder[BigDecimal], ColumnBuilder[String])

    // Builders for a column first seen at row `rowIdx`: pre-filled with one
    // NA per earlier row so that later appends land on the correct row.
    def newBuilders(rowIdx: Int): Builders = {
      val numCol = Column.newBuilder[BigDecimal]()
      val strCol = Column.newBuilder[String]()
      (0 until rowIdx).foreach { _ =>
        numCol.addNA()
        strCol.addNA()
      }
      (numCol, strCol)
    }

    val cols = rows.zipWithIndex.foldLeft(Map.empty[Int, Builders]) { case (acc0, (row, rowIdx)) =>
      val acc1 = row.cells.zipWithIndex.foldLeft(acc0) { case (acc, (cell, colIdx)) =>
        val (numCol, strCol) = acc.getOrElse(colIdx, newBuilders(rowIdx))
        cell match {
          case CsvCell.Data(value) =>
            numCol += scala.util.Try(BigDecimal(value)).map(Value(_)).getOrElse(NA)
            strCol.addValue(value)
          case CsvCell.Empty =>
            numCol.addNA()
            strCol.addNA()
          case CsvCell.Invalid =>
            numCol.addNM()
            strCol.addNM()
        }
        acc + (colIdx -> (numCol, strCol))
      }

      // Columns this row was too short to reach still need an entry for it;
      // otherwise the next value appended to them would shift up a row.
      val width = row.cells.size
      acc1.foreach { case (colIdx, (numCol, strCol)) =>
        if (colIdx >= width) {
          numCol.addNA()
          strCol.addNA()
        }
      }
      acc1
    }

    val columns = Column.eval(cols.map { case (col, (numCol, strCol)) =>
      col -> Value(TypedColumn(numCol.result()) orElse TypedColumn(strCol.result()))
    })

    ColOrientedFrame(
      Index(Array.range(0, rows.size)),
      Index(Array.range(0, cols.size)),
      columns
    )
  }

  /**
   * Converts a frame to a CSV by reading every column of each row as a
   * `CsvRow`. When `format.header` is set, the result is labeled with the
   * `toString` of each column key; otherwise it is unlabeled.
   */
  def fromFrame[Col](format: CsvFormat)(frame: Frame[_, Col]): Csv = {
    val rows = frame.get(Cols.all[Col].as[CsvRow]).denseIterator.map {
      case (_, row) => Right(row)
    }.toVector

    if (format.header) {
      val header = frame.colIndex.toVector.map(_._1.toString)
      LabeledCsv(format, header, rows)
    } else {
      UnlabeledCsv(format, rows)
    }
  }

  import java.nio.charset.{ Charset, StandardCharsets }
  import java.io.File
  import java.io.{ InputStream, FileInputStream }
  import java.io.{ Reader, InputStreamReader, StringReader }

  /**
   * Parses CSV from `reader`. With a `GuessCsvFormat` strategy the format is
   * first guessed from the reader, and parsing continues from the reader the
   * guess returns; with a concrete `CsvFormat` the reader is parsed directly.
   *
   * The reader is not closed by this method.
   */
  def parseReader(reader: Reader, format: CsvFormatStrategy = CsvFormat.Guess): Csv = {
    val (format0, reader0) = format match {
      case (guess: GuessCsvFormat) => guess(reader)
      case (fmt: CsvFormat) => (fmt, reader)
    }
    CsvParser(format0).parseReader(reader0)
  }

  /** Parses CSV from an in-memory string. */
  def parseString(input: String, format: CsvFormatStrategy = CsvFormat.Guess): Csv =
    parseReader(new StringReader(input), format)

  /**
   * Parses CSV from `is`, decoding bytes with `charset` (UTF-8 by default).
   *
   * The stream is not closed by this method; it remains owned by the caller.
   */
  def parseInputStream(is: InputStream, format: CsvFormatStrategy = CsvFormat.Guess, charset: Charset = StandardCharsets.UTF_8): Csv =
    parseReader(new InputStreamReader(is, charset), format)

  /**
   * Parses the CSV file `file`, decoding bytes with `charset` (UTF-8 by default).
   *
   * The file is opened here and always closed before returning, whether
   * parsing succeeds or throws. This is safe because the returned [[Csv]]
   * holds its rows in a `Vector`, so nothing is read after this call.
   */
  def parseFile(file: File, format: CsvFormatStrategy = CsvFormat.Guess, charset: Charset = StandardCharsets.UTF_8): Csv = {
    val is = new FileInputStream(file)
    try parseInputStream(is, format, charset)
    finally is.close()
  }

  /** Parses the CSV file at path `filename`; see `parseFile`. */
  def parsePath(filename: String, format: CsvFormatStrategy = CsvFormat.Guess, charset: Charset = StandardCharsets.UTF_8): Csv =
    parseFile(new File(filename), format, charset)
}
37 changes: 37 additions & 0 deletions framian/src/main/scala/framian/csv/CsvCell.scala
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
package framian
package csv

import spire.syntax.monoid._

/**
 * A single cell of a CSV row. The cases, defined in the companion object,
 * are `Data`, `Empty` and `Invalid`.
 */
sealed abstract class CsvCell {
/** Renders this cell as the text to be written out under `format`. */
def render(format: CsvFormat): String
}

/** The [[CsvCell]] cases, plus conversions into cells. */
object CsvCell {
  /** A cell holding the text `value`; rendered through `format.render`. */
  case class Data(value: String) extends CsvCell {
    override def toString: String = value
    def render(format: CsvFormat): String = format.render(value)
  }

  /** A cell with no value; rendered as the format's `empty` text. */
  case object Empty extends CsvCell {
    override def toString: String = "-"
    def render(format: CsvFormat): String = format.empty
  }

  /** A cell whose value is invalid; rendered as the format's `invalid` text. */
  case object Invalid extends CsvCell {
    override def toString: String = "<error>"
    def render(format: CsvFormat): String = format.invalid
  }

  /** Maps a non-value to its cell: `NA` becomes `Empty`, `NM` becomes `Invalid`. */
  def fromNonValue(nonValue: NonValue): CsvCell =
    nonValue match {
      case NM => Invalid
      case NA => Empty
    }

  /**
   * Reads any typed column as a column of cells. Three views of the column
   * are produced -- as `BigDecimal`, as `String`, and as `Any` (the latter
   * via `toString`) -- and combined with `|+|` in that order.
   */
  implicit val CsvCellColumnTyper: ColumnTyper[CsvCell] = new ColumnTyper[CsvCell] {
    // Widens to CsvCell so that all three views share one element type.
    private def wrap(text: String): CsvCell = Data(text)

    def cast(col: TypedColumn[_]): Column[CsvCell] = {
      val numeric = col.cast[BigDecimal].map(n => wrap(n.toString))
      val textual = col.cast[String].map(s => wrap(s))
      val fallback = col.cast[Any].map(a => wrap(a.toString))
      numeric |+| textual |+| fallback
    }
  }
}
Loading

0 comments on commit efb5d77

Please sign in to comment.