-
Notifications
You must be signed in to change notification settings - Fork 29
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #53 from pellucidanalytics/topic/csv
Better CSV support
- Loading branch information
Showing
15 changed files
with
947 additions
and
420 deletions.
There are no files selected for viewing
This file was deleted.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,136 @@ | ||
package framian | ||
package csv | ||
|
||
import spire.std.int._ | ||
import spire.std.string._ | ||
|
||
import framian.column.ColumnBuilder | ||
|
||
/**
 * A CSV document: the format it is associated with, plus one entry per row,
 * where each entry is either an error or a successfully read row.
 *
 * The only two implementations are [[LabeledCsv]] (has a header) and
 * [[UnlabeledCsv]] (no header).
 */
sealed abstract class Csv {
  /** The format of this CSV; `toString` renders every row with it. */
  val format: CsvFormat

  /** All rows in order: `Left` holds an error, `Right` holds a usable row. */
  val rows: Vector[Either[CsvError, CsvRow]]

  /** Only the rows that are `Right`, in their original order. */
  lazy val data: Vector[CsvRow] =
    for (Right(row) <- rows) yield row

  /** Only the errors (the `Left` entries), in their original order. */
  lazy val errors: Vector[CsvError] =
    for (Left(error) <- rows) yield error

  /** True when at least one row is an error. */
  def hasErrors: Boolean = errors.nonEmpty

  /**
   * This CSV without a header.
   *
   * An unlabeled CSV is returned unchanged. For a labeled CSV the header
   * values are dropped (they are not turned back into a row) and the format
   * is copied with `header = false`.
   */
  def unlabeled: UnlabeledCsv = this match {
    case plain: UnlabeledCsv =>
      plain
    case LabeledCsv(fmt, _, body) =>
      UnlabeledCsv(fmt.copy(header = false), body)
  }

  /**
   * This CSV with a header.
   *
   * A labeled CSV is returned unchanged. For an unlabeled CSV the first row
   * is promoted to the header (read as text using the current format) and the
   * remaining rows become the body. When there is no first row, or the first
   * row is an error, the result has an empty header and no rows at all.
   */
  def labeled: LabeledCsv = this match {
    case withHeader: LabeledCsv =>
      withHeader
    case UnlabeledCsv(fmt, allRows) =>
      val headerFormat = fmt.copy(header = true)
      allRows.headOption match {
        case Some(Right(first)) =>
          // The header text is extracted with the original format, not the copy.
          LabeledCsv(headerFormat, first.text(fmt), allRows.tail)
        case _ =>
          LabeledCsv(headerFormat, Vector.empty, Vector.empty)
      }
  }

  /**
   * Renders the valid rows (preceded by the header, if there is one) using
   * `format`, joined by the format's row delimiter. Error rows are left out.
   */
  override def toString: String = {
    val lines: Vector[CsvRow] = this match {
      case lc: LabeledCsv =>
        CsvRow(lc.header.map(CsvCell.Data(_))) +: data
      case _: UnlabeledCsv =>
        data
    }

    val separator = format.rowDelim.value
    lines.iterator.map(line => line.render(format)).mkString(separator)
  }
}
|
||
/**
 * A CSV with a header: `header` holds one label per column and `rows` holds
 * the body (errors included).
 */
case class LabeledCsv(format: CsvFormat, header: Vector[String], rows: Vector[Either[CsvError, CsvRow]]) extends Csv {
  /**
   * Builds a frame from the valid rows, then swaps the positional column
   * index for one keyed by the header labels.
   */
  def toFrame: Frame[Int, String] = {
    val positional = Csv.toFrame(data)
    val labels = Index.fromKeys(header: _*)
    positional.withColIndex(labels)
  }
}
|
||
/**
 * A CSV without a header: every entry of `rows` is body content (or an
 * error).
 */
case class UnlabeledCsv(format: CsvFormat, rows: Vector[Either[CsvError, CsvRow]]) extends Csv {
  /** Builds a frame from the valid rows; columns are keyed by position. */
  def toFrame: Frame[Int, Int] = {
    val validRows = data
    Csv.toFrame(validRows)
  }
}
|
||
/**
 * Constructors and converters for [[Csv]]: parsing from readers, strings,
 * streams and files, and conversion to and from frames.
 */
object Csv {
  val BufferSize = 32 * 1024

  /**
   * A CSV with no rows. It is a [[LabeledCsv]] with an empty header when
   * `format.header` is set, and an [[UnlabeledCsv]] otherwise.
   */
  def empty(format: CsvFormat): Csv =
    if (format.header) LabeledCsv(format, Vector.empty, Vector.empty)
    else UnlabeledCsv(format, Vector.empty)

  /**
   * Builds a frame with one row per element of `rows` and one column per cell
   * position, both indexed from 0.
   *
   * Each column is collected twice, as `BigDecimal` and as `String`; the
   * resulting column is the numeric one with the string one as fallback.
   * A data cell that does not parse as a `BigDecimal` is NA in the numeric
   * column. Empty cells are NA and invalid cells are NM in both.
   *
   * Rows shorter than the widest row are treated as if their missing trailing
   * cells were empty.
   */
  private[csv] def toFrame(rows: Vector[CsvRow]): Frame[Int, Int] = {
    // The column builders append sequentially, so every row has to add exactly
    // one entry to every column. Without padding, a short row would add nothing
    // to the trailing columns and values from later rows would end up on the
    // wrong row.
    val width = rows.foldLeft(0) { (widest, row) => scala.math.max(widest, row.cells.size) }

    val cols = rows.foldLeft(Map.empty[Int,(ColumnBuilder[BigDecimal],ColumnBuilder[String])]) { (acc0, row) =>
      val cells = row.cells.padTo(width, CsvCell.Empty: CsvCell)
      cells.zipWithIndex.foldLeft(acc0) { case (acc, (cell, colIdx)) =>
        val (numCol, strCol) = acc.getOrElse(colIdx, (Column.newBuilder[BigDecimal](), Column.newBuilder[String]()))
        cell match {
          case CsvCell.Data(value) =>
            numCol += scala.util.Try(BigDecimal(value)).map(Value(_)).getOrElse(NA)
            strCol.addValue(value)
          case CsvCell.Empty =>
            numCol.addNA()
            strCol.addNA()
          case CsvCell.Invalid =>
            numCol.addNM()
            strCol.addNM()
        }
        acc + (colIdx -> (numCol, strCol))
      }
    }

    val columns = Column.eval(cols.map { case (col, (numCol, strCol)) =>
      col -> Value(TypedColumn(numCol.result()) orElse TypedColumn(strCol.result()))
    })

    ColOrientedFrame(
      Index(Array.range(0, rows.size)),
      Index(Array.range(0, cols.size)),
      columns
    )
  }

  /**
   * Converts a frame to a CSV. Every row of the frame that can be read as a
   * `CsvRow` across all columns becomes a `Right` row. When `format.header`
   * is set, the header is the `toString` of each column key.
   */
  def fromFrame[Col](format: CsvFormat)(frame: Frame[_, Col]): Csv = {
    val rows = frame.get(Cols.all[Col].as[CsvRow]).denseIterator.map {
      case (_, row) => Right(row)
    }.toVector

    if (format.header) {
      val header = frame.colIndex.toVector.map(_._1.toString)
      LabeledCsv(format, header, rows)
    } else {
      UnlabeledCsv(format, rows)
    }
  }

  import java.nio.charset.{ Charset, StandardCharsets }
  import java.io.File
  import java.io.{ InputStream, FileInputStream }
  import java.io.{ Reader, InputStreamReader, StringReader }

  /**
   * Parses CSV text from `reader`.
   *
   * When `format` is a `GuessCsvFormat` it is applied to the reader first and
   * yields the format to parse with, together with the reader to parse from;
   * a concrete `CsvFormat` is used as given. The reader is not closed here.
   */
  def parseReader(reader: Reader, format: CsvFormatStrategy = CsvFormat.Guess): Csv = {
    val (format0, reader0) = format match {
      case (guess: GuessCsvFormat) => guess(reader)
      case (fmt: CsvFormat) => (fmt, reader)
    }
    CsvParser(format0).parseReader(reader0)
  }

  /** Parses CSV text held in a string. */
  def parseString(input: String, format: CsvFormatStrategy = CsvFormat.Guess): Csv =
    parseReader(new StringReader(input), format)

  /**
   * Parses CSV text from `is`, decoded with `charset` (UTF-8 by default).
   * The stream belongs to the caller and is not closed here.
   */
  def parseInputStream(is: InputStream, format: CsvFormatStrategy = CsvFormat.Guess, charset: Charset = StandardCharsets.UTF_8): Csv =
    parseReader(new InputStreamReader(is, charset), format)

  /**
   * Parses the CSV stored in `file`, decoded with `charset` (UTF-8 by
   * default).
   *
   * The stream is opened by this method, so it is also closed by this method,
   * whether parsing succeeds or throws. The rows of the returned `Csv` are
   * held in a `Vector`, so nothing should still be reading from the stream
   * once the parser has returned.
   */
  def parseFile(file: File, format: CsvFormatStrategy = CsvFormat.Guess, charset: Charset = StandardCharsets.UTF_8): Csv = {
    val is = new FileInputStream(file)
    try parseInputStream(is, format, charset)
    finally is.close()
  }

  /** Parses the CSV stored at the path `filename`; see `parseFile`. */
  def parsePath(filename: String, format: CsvFormatStrategy = CsvFormat.Guess, charset: Charset = StandardCharsets.UTF_8): Csv =
    parseFile(new File(filename), format, charset)
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,37 @@ | ||
package framian | ||
package csv | ||
|
||
import spire.syntax.monoid._ | ||
|
||
/**
 * One cell of a CSV row. A cell is either text ([[CsvCell.Data]]), empty
 * ([[CsvCell.Empty]]) or invalid ([[CsvCell.Invalid]]).
 */
sealed abstract class CsvCell {
  /** The text to write for this cell when rendering with `format`. */
  def render(format: CsvFormat): String
}

object CsvCell {
  /** A cell holding text; rendered through `format.render`. */
  case class Data(value: String) extends CsvCell {
    def render(format: CsvFormat): String = format.render(value)
    override def toString: String = value
  }

  /** A cell with no value; rendered as `format.empty`. */
  case object Empty extends CsvCell {
    def render(format: CsvFormat): String = format.empty
    override def toString: String = "-"
  }

  /** A cell whose value is not meaningful; rendered as `format.invalid`. */
  case object Invalid extends CsvCell {
    def render(format: CsvFormat): String = format.invalid
    override def toString: String = "<error>"
  }

  /** Maps a non-value to its cell: `NA` becomes `Empty`, `NM` becomes `Invalid`. */
  def fromNonValue(nonValue: NonValue): CsvCell =
    nonValue match {
      case NA => Empty
      case NM => Invalid
    }

  /**
   * Lets any typed column be read as a column of cells. Three casts are
   * combined, in this order: `BigDecimal`, `String`, and finally `Any`; each
   * value becomes a `Data` cell (via `toString` for the non-string casts).
   */
  implicit val CsvCellColumnTyper: ColumnTyper[CsvCell] = new ColumnTyper[CsvCell] {
    def cast(col: TypedColumn[_]): Column[CsvCell] = {
      // Widening to CsvCell here keeps all three columns at the same type so
      // they can be combined with |+|.
      def wrap(text: String): CsvCell = Data(text)

      val fromNumbers = col.cast[BigDecimal].map(n => wrap(n.toString))
      val fromStrings = col.cast[String].map(s => wrap(s))
      val fromAnything = col.cast[Any].map(x => wrap(x.toString))

      fromNumbers |+| fromStrings |+| fromAnything
    }
  }
}
Oops, something went wrong.