From 1953dd776851ff32b11676895972b9c34ce9502f Mon Sep 17 00:00:00 2001 From: Tom Switzer Date: Tue, 22 Jul 2014 16:00:00 -0400 Subject: [PATCH 01/13] First pass at better CSV parser. --- .../src/main/scala/framian/csv/CsvError.scala | 3 + .../main/scala/framian/csv/CsvFormat.scala | 111 +++++++ .../main/scala/framian/csv/CsvLoader.scala | 2 + .../main/scala/framian/csv/CsvParser.scala | 306 ++++++++++++++++++ .../scala/framian/csv/CsvRowExtractor.scala | 42 --- .../src/main/scala/framian/csv/Input.scala | 46 +++ .../main/scala/framian/csv/ParseResult.scala | 14 + .../main/scala/framian/csv/ParserState.scala | 21 ++ 8 files changed, 503 insertions(+), 42 deletions(-) create mode 100644 framian/src/main/scala/framian/csv/CsvError.scala create mode 100644 framian/src/main/scala/framian/csv/CsvFormat.scala create mode 100644 framian/src/main/scala/framian/csv/CsvLoader.scala create mode 100644 framian/src/main/scala/framian/csv/CsvParser.scala create mode 100644 framian/src/main/scala/framian/csv/Input.scala create mode 100644 framian/src/main/scala/framian/csv/ParseResult.scala create mode 100644 framian/src/main/scala/framian/csv/ParserState.scala diff --git a/framian/src/main/scala/framian/csv/CsvError.scala b/framian/src/main/scala/framian/csv/CsvError.scala new file mode 100644 index 0000000..4e92609 --- /dev/null +++ b/framian/src/main/scala/framian/csv/CsvError.scala @@ -0,0 +1,3 @@ +package framian.csv + +case class CsvError(message: String, pos: Long, context: String, row: Long, col: Long) diff --git a/framian/src/main/scala/framian/csv/CsvFormat.scala b/framian/src/main/scala/framian/csv/CsvFormat.scala new file mode 100644 index 0000000..c18f2ce --- /dev/null +++ b/framian/src/main/scala/framian/csv/CsvFormat.scala @@ -0,0 +1,111 @@ +package framian.csv + +import java.io.{ Reader, PushbackReader } + +sealed abstract class CsvRowDelim(val value: String, val alternate: Option[String] = None) +object CsvRowDelim { + case object Unix extends CsvRowDelim("\n") + 
case object Windows extends CsvRowDelim("\r\n") + case object Both extends CsvRowDelim("\n", Some("\r\n")) +} + +case class CsvFormat( + separator: String, + quote: String = "\"", + quoteEscape: String = "\"", + empty: String = "NA", + invalid: String = "NM", + header: Boolean = true, + rowDelim: CsvRowDelim = CsvRowDelim.Both +) { + val escapedQuote = quoteEscape + quote + + /** + * Replaces all instances of \r\n with \n, then escapes all quotes and wraps + * the string in quotes. + */ + def escape(text: String): String = { + val text0 = text.replace("\r\n", "\n").replace(quote, escapedQuote) + s"${quote}$text0${quote}" + } + + /** + * Renders a single cell of data, escaping the value if necessary. + */ + def render(text: String): String = { + if ((text contains '\n') || + (text contains separator) || + (text contains quote)) escape(text) + else text + } +} + +object CsvFormat { + val BufferSize = 32 * 1024 + + val CSV = CsvFormat(",") + val TSV = CsvFormat("\t") + + /** + * Makes a guess at the format of the CSV accessed by `reader`. This returns + * the format, as well as a new pushback reader to be used in place of + * `reader`. The original reader will have some data read out of it. The + * returned reader will contain all the original reader's data. + */ + def guess(reader: Reader): (CsvFormat, Reader) = { + val reader0 = new PushbackReader(reader, BufferSize) + val buffer = new Array[Char](BufferSize) + val len = reader0.read(buffer) + reader0.unread(buffer, 0, len) + + val chunk = new String(buffer, 0, len) + val format = guess(chunk) + (format, reader0) + } + + /** + * Performs a very naive guess of the CsvFormat. This uses weighted + * frequencies of occurrences of common separators, row-delimiters, quotes, + * quote escapes, etc. and simply selects the max for each. 
+ */ + def guess(str: String): CsvFormat = { + def count(ndl: String): Int = { + def check(i: Int, j: Int = 0): Boolean = + if (j >= ndl.length) true + else if (i < str.length && str.charAt(i) == ndl.charAt(j)) check(i + 1, j + 1) + else false + + def loop(i: Int, cnt: Int): Int = + if (i < str.length) { + loop(i + 1, if (check(i)) cnt + 1 else cnt) + } else cnt + + loop(0, 0) + } + + def choose(weightedOptions: (String, Double)*): String = { + val weights = Map(weightedOptions: _*) + val (best, weight) = weights.maxBy { case (c, w) => w * count(c) } + if (weight > 0) best else weights.maxBy(_._2)._1 + } + + val windCnt = count("\r\n") + val unixCnt = count("\n") + val rowDelim = + if ((windCnt < 4 * unixCnt) && (unixCnt < 4 * windCnt)) CsvRowDelim.Both + else if (windCnt < 4 * unixCnt) CsvRowDelim.Unix + else CsvRowDelim.Windows + val separator = choose( + "," -> 1.0, + "\t" -> 3.0, + ";" -> 2.0, + "|" -> 3.0 + ) + val quote = choose("\"" -> 1.2, "\'" -> 1) + val quoteEscape = choose(s"$quote$quote" -> 1.1, s"\\$quote" -> 1).dropRight(quote.length) + val empty = choose("-" -> 1.5, "N/A" -> 2, "NA" -> 1) + val invalid = choose("N/M" -> 2, "NaN" -> 1) + + CsvFormat(separator, quote, quoteEscape, empty, invalid, true, rowDelim) + } +} diff --git a/framian/src/main/scala/framian/csv/CsvLoader.scala b/framian/src/main/scala/framian/csv/CsvLoader.scala new file mode 100644 index 0000000..af90e6f --- /dev/null +++ b/framian/src/main/scala/framian/csv/CsvLoader.scala @@ -0,0 +1,2 @@ +package framian.csv + diff --git a/framian/src/main/scala/framian/csv/CsvParser.scala b/framian/src/main/scala/framian/csv/CsvParser.scala new file mode 100644 index 0000000..9ab24c5 --- /dev/null +++ b/framian/src/main/scala/framian/csv/CsvParser.scala @@ -0,0 +1,306 @@ +package framian.csv + +import java.nio.charset.{ Charset, StandardCharsets } +import java.io.File +import java.io.{ InputStream, FileInputStream } +import java.io.{ Reader, InputStreamReader } + +// case class Csv(format: 
CsvFormat, header: Option[CsvRow], data: Vector[CsvRow]) { +// } +// +// sealed trait CsvCell { +// def value: String +// def isNumeric: Boolean +// def toDouble: Option[Double] +// def toBigDecimal: Option[Double] +// } +// +// object CsvCell { +// case class Unquoted(value: String) extends CsvCell +// case class Quoted(value: String) extends CsvCell +// case object Empty extends CsvCell("") +// case object Invalid extends CsvCell("") +// } +// + +case class CsvParser(format: CsvFormat) { + import ParserState._ + import Instr._ + + def mkError(input: Input, s0: ParserState, s1: ParserState, row: Long, msg: String, pos: Long): CsvError = { + val context = input.substring(s0.input.mark, s1.input.mark) + CsvError(msg, pos, context, row, pos - s0.input.mark + 1) + } + + def parseResource[A](a: A, close: A => Unit)(read: A => Option[String]): Vector[Either[CsvError, Vector[String]]] = { + def loop(s0: ParserState, row: Long, acc: Vector[Either[CsvError, Vector[String]]]): Vector[Either[CsvError, Vector[String]]] = { + val (s1, instr) = parse(s0) + + instr match { + case Emit(cells) => + loop(s1, row + 1, acc :+ Right(cells)) + case Fail(msg, pos) => + loop(s1, row + 1, acc :+ Left(mkError(s1.input, s0, s1, row, msg, pos))) + case Resume => + loop(s1, row, acc) + case NeedInput => + read(a) match { + case Some(chunk) => + loop(s1.mapInput(_.append(chunk)), row, acc) + case None => + loop(s1.mapInput(_.finished), row, acc) + } + case Done => + acc + } + } + + try { + read(a).map { input0 => + loop(ParseRow(Input.init(input0)), 1L, Vector.empty) + }.getOrElse { + Vector.empty + } + } finally { + try { + close(a) + } catch { case (_: Exception) => + // Do nothing - hopefully letting original exception through. 
+ } + } + } + + def parseReader(reader: Reader): Vector[Either[CsvError, Vector[String]]] = { + val buffer = new Array[Char](CsvParser.BufferSize) + parseResource[Reader](reader, _.close()) { reader => + val len = reader.read(buffer) + if (len >= 0) { + Some(new String(buffer, 0, len)) + } else { + None + } + } + } + + def parseInputStream(is: InputStream, charset: Charset = StandardCharsets.UTF_8): Vector[Either[CsvError, Vector[String]]] = + parseReader(new InputStreamReader(is, charset)) + + def parseFile(file: File, charset: Charset = StandardCharsets.UTF_8): Vector[Either[CsvError, Vector[String]]] = + parseInputStream(new FileInputStream(file), charset) + + def parseString(input: String): Vector[Either[CsvError, Vector[String]]] = { + var next: Option[String] = Some(input) + parseResource[Unit]((), _ => ()) { _ => + val chunk = next; next = None; chunk + } + } + + private def parse(state: ParserState): (ParserState, Instr[Vector[String]]) = { + import format._ + + val input: Input = state.input + var pos: Long = input.mark + def ch: Char = input.charAt(pos) + def endOfInput: Boolean = pos >= input.length + def endOfFile: Boolean = endOfInput && input.isLast + def advance(i: Long = 1): Unit = pos += i + def retreat(i: Long = 1): Unit = pos -= i + + def isFlag(str: String): () => Int = { + def loop(i: Int): Int = + if (i >= str.length) { + retreat(i) + i + } else if (endOfInput) { + retreat(i) + if (endOfFile) 0 else -1 + } else if (str.charAt(i) == ch) { + advance() + loop(i + 1) + } else { + retreat(i) + 0 + } + + () => loop(0) + } + + def either(f0: () => Int, f1: () => Int): () => Int = { () => + val i = f0() + if (i == 0) f1() else i + } + + val isQuote = isFlag(quote) + val isQuoteEscape = isFlag(quoteEscape) + val isSeparator = isFlag(separator) + val isRowDelim = rowDelim.alternate.map { alt => + either(isFlag(rowDelim.value), isFlag(alt)) + }.getOrElse(isFlag(rowDelim.value)) + val isEndOfCell = either(isSeparator, isRowDelim) + def isEscapedQuote() = 
{ + val e = isQuoteEscape() + if (e > 0) { + advance(e) + val q = isQuote() + retreat(e) + if (q > 0) q + e + else q + } else { + e + } + } + + def unquotedCell(): ParseResult[String] = { + val start = pos + def loop(): ParseResult[String] = { + val flag = isEndOfCell() + if (flag > 0 || endOfFile) { + Emit(input.substring(start, pos)) + } else if (flag == 0) { + advance() + loop() + } else { + NeedInput + } + } + + loop() + } + + def quotedCell(): ParseResult[String] = { + val start = pos + def loop(): ParseResult[String] = { + if (endOfInput) { + if (endOfFile) { + Fail("Unmatched quoted string at end of file", pos) + } else { + NeedInput + } + } else { + val d = isRowDelim() + val e = isEscapedQuote() + val q = isQuote() + + if (d < 0 || e < 0 || q < 0) { + NeedInput + } else if (d > 0) { + Fail("Unmatched quoted string at row delimiter", pos) + } else if (e > 0) { + advance(e) + loop() + } else if (q > 0) { + val escaped = input.substring(start, pos) + advance(q) + Emit(escaped) + } else { + advance(1) + loop() + } + } + } + + loop() + } + + def cell(): ParseResult[String] = { + val q = isQuote() + if (q == 0) { + unquotedCell() + } else if (q > 0) { + advance(q) + quotedCell() + } else { + NeedInput + } + } + + def skipToNextRow(): Boolean = { + val d = isRowDelim() + if (d == 0) { + advance(1) + skipToNextRow() + } else if (d > 0) { + advance(d) + true + } else { + if (input.isLast) + advance(input.length - pos) + input.isLast + } + } + + def row(cells: Vector[String]): (ParserState, Instr[Vector[String]]) = { + val start = pos + def needInput() = (ContinueRow(cells, input.marked(start)), NeedInput) + + val s = isSeparator() + if (s == 0) { + val r = isRowDelim() + if (r > 0 || endOfFile) { + advance(r) + (ParseRow(input.marked(pos)), Emit(cells)) + } else if (r == 0) { + (SkipRow(input.marked(pos)), Fail("Expected separator, row delimiter, or end of file", pos)) + } else { + needInput() + } + } else if (s > 0) { + advance(s) + cell() match { + case Emit(str) 
=> + row(cells :+ str) + case f @ Fail(_, _) => + (SkipRow(input.marked(pos)), f) + case NeedInput => + needInput() + } + } else { + needInput() + } + } + + state match { + case ContinueRow(partial, _) => + row(partial) + + case instr @ ParseRow(_) => + if (endOfFile) { + (instr, Done) + } else { + cell() match { + case Emit(csvCell) => + row(Vector(csvCell)) + case f @ Fail(_, _) => + (SkipRow(input.marked(pos)), f) + case NeedInput => + (instr, NeedInput) + } + } + + case SkipRow(_) => + if (skipToNextRow()) { + (ParseRow(input.marked(pos)), Resume) + } else { + (SkipRow(input.marked(pos)), NeedInput) + } + } + } +} + +object CsvParser { + val BufferSize = 32 * 1024 + + def parseString(input: String): Vector[Either[CsvError, Vector[String]]] = + CsvParser(CsvFormat.guess(input)).parseString(input) + + def parseReader(reader: Reader): Vector[Either[CsvError, Vector[String]]] = { + val (format, reader0) = CsvFormat.guess(reader) + CsvParser(format).parseReader(reader0) + } + + def parseInputStream(is: InputStream, charset: Charset = StandardCharsets.UTF_8): Vector[Either[CsvError, Vector[String]]] = + parseReader(new InputStreamReader(is, charset)) + + def parseFile(file: File, charset: Charset = StandardCharsets.UTF_8): Vector[Either[CsvError, Vector[String]]] = + parseInputStream(new FileInputStream(file), charset) +} diff --git a/framian/src/main/scala/framian/csv/CsvRowExtractor.scala b/framian/src/main/scala/framian/csv/CsvRowExtractor.scala index fff0d98..67afc65 100644 --- a/framian/src/main/scala/framian/csv/CsvRowExtractor.scala +++ b/framian/src/main/scala/framian/csv/CsvRowExtractor.scala @@ -25,48 +25,6 @@ package csv import spire.syntax.monoid._ import org.joda.time._ -sealed abstract class CsvRowDelim(val value: String) -object CsvRowDelim { - case object Unix extends CsvRowDelim("\n") - case object Windows extends CsvRowDelim("\r\n") -} - -case class CsvFormat( - separator: String, - quote: String = "\"", - quoteEscape: String = "\"", - empty: String 
= "NA", - invalid: String = "NM", - header: Boolean = true, - rowDelim: CsvRowDelim = CsvRowDelim.Windows -) { - val escapedQuote = quoteEscape + quote - - /** - * Replaces all instances of \r\n with \n, then escapes all quotes and wraps - * the string in quotes. - */ - def escape(text: String): String = { - val text0 = text.replace("\r\n", "\n").replace(quote, escapedQuote) - s"${quote}$text0${quote}" - } - - /** - * Renders a single cell of data, escaping the value if necessary. - */ - def render(text: String): String = { - if ((text contains '\n') || - (text contains separator) || - (text contains quote)) escape(text) - else text - } -} - -object CsvFormat { - val CSV = CsvFormat(",") - val TSV = CsvFormat("\t") -} - sealed abstract class CsvCell(val render: CsvFormat => String) object CsvCell { diff --git a/framian/src/main/scala/framian/csv/Input.scala b/framian/src/main/scala/framian/csv/Input.scala new file mode 100644 index 0000000..2904f8b --- /dev/null +++ b/framian/src/main/scala/framian/csv/Input.scala @@ -0,0 +1,46 @@ +package framian.csv + +case class Input(offset: Long, data: String, isLast: Boolean, mark: Long) { + private def check(i: Long): Int = if ((i < offset) || (i > (offset + data.length))) { + throw new IndexOutOfBoundsException() + } else { + val j = i - offset + if (j <= Int.MaxValue) { + j.toInt + } else { + throw new IndexOutOfBoundsException() + } + } + + def charAt(i: Long): Char = data.charAt(check(i)) + + def length: Long = offset + data.length + + def substring(from: Long, until: Long): String = + data.substring(check(from), check(until)) + + def marked(pos: Long): Input = + Input(offset, data, isLast, pos) + + private def trim: Input = if (mark > offset) { + val next = spire.math.min(mark - offset, data.length.toLong).toInt + val tail = data.substring(next) + val offset0 = offset + next + Input(offset0, tail, isLast, offset0) + } else this + + def append(chunk: String, last: Boolean = false): Input = + if (mark > offset) 
trim.append(chunk, last) + else if (chunk.isEmpty) Input(offset, data, last, mark) + else Input(offset, data + chunk, last, mark) + + def finished: Input = Input(offset, data, true, mark) +} + +object Input { + def fromString(str: String): Input = + Input(0, str, true, 0) + + def init(str: String): Input = + Input(0, str, false, 0) +} diff --git a/framian/src/main/scala/framian/csv/ParseResult.scala b/framian/src/main/scala/framian/csv/ParseResult.scala new file mode 100644 index 0000000..3925013 --- /dev/null +++ b/framian/src/main/scala/framian/csv/ParseResult.scala @@ -0,0 +1,14 @@ +package framian.csv + +sealed trait Instr[+A] + +object Instr { + sealed trait ParseResult[+A] extends Instr[A] + + case class Emit[+A](value: A) extends ParseResult[A] + case class Fail(message: String, pos: Long) extends ParseResult[Nothing] + case object NeedInput extends ParseResult[Nothing] + + case object Resume extends Instr[Nothing] + case object Done extends Instr[Nothing] +} diff --git a/framian/src/main/scala/framian/csv/ParserState.scala b/framian/src/main/scala/framian/csv/ParserState.scala new file mode 100644 index 0000000..9b8f29e --- /dev/null +++ b/framian/src/main/scala/framian/csv/ParserState.scala @@ -0,0 +1,21 @@ +package framian.csv + +sealed trait ParserState { + import ParserState._ + + def input: Input + + def withInput(input0: Input): ParserState = this match { + case ContinueRow(partial, _) => ContinueRow(partial, input0) + case SkipRow(_) => SkipRow(input0) + case ParseRow(_) => ParseRow(input0) + } + + def mapInput(f: Input => Input): ParserState = withInput(f(input)) +} + +object ParserState { + case class ContinueRow(partial: Vector[String], input: Input) extends ParserState + case class SkipRow(input: Input) extends ParserState + case class ParseRow(input: Input) extends ParserState +} From 1ca1b8252f9f684aa71deb5062c74813bd9d6366 Mon Sep 17 00:00:00 2001 From: Tom Switzer Date: Wed, 23 Jul 2014 22:15:43 -0600 Subject: [PATCH 02/13] Csv -> Frame 
support and bug fixes. --- framian/src/main/scala/framian/Frame.scala | 2 +- .../main/scala/framian/UntypedColumn.scala | 12 ++- framian/src/main/scala/framian/csv/Csv.scala | 93 +++++++++++++++++++ .../src/main/scala/framian/csv/CsvCell.scala | 37 ++++++++ .../main/scala/framian/csv/CsvFormat.scala | 28 ++++-- .../main/scala/framian/csv/CsvParser.scala | 91 +++++++----------- .../src/main/scala/framian/csv/CsvRow.scala | 24 +++++ .../scala/framian/csv/CsvRowExtractor.scala | 63 ------------- .../main/scala/framian/csv/ParserState.scala | 2 +- 9 files changed, 217 insertions(+), 135 deletions(-) create mode 100644 framian/src/main/scala/framian/csv/Csv.scala create mode 100644 framian/src/main/scala/framian/csv/CsvCell.scala create mode 100644 framian/src/main/scala/framian/csv/CsvRow.scala diff --git a/framian/src/main/scala/framian/Frame.scala b/framian/src/main/scala/framian/Frame.scala index a3b7fcc..87e7099 100644 --- a/framian/src/main/scala/framian/Frame.scala +++ b/framian/src/main/scala/framian/Frame.scala @@ -235,7 +235,7 @@ trait Frame[Row, Col] { } Column.fromCells(cells.toVector) } - Series(rowIndex, column) + Series(rowIndex.resetIndices, column) } def getRow(key: Row): Option[Rec[Col]] = rowIndex.get(key) map Rec.fromRow(this) diff --git a/framian/src/main/scala/framian/UntypedColumn.scala b/framian/src/main/scala/framian/UntypedColumn.scala index 1648e0c..ac2df37 100644 --- a/framian/src/main/scala/framian/UntypedColumn.scala +++ b/framian/src/main/scala/framian/UntypedColumn.scala @@ -36,16 +36,18 @@ import shapeless.syntax.typeable._ */ trait UntypedColumn extends ColumnLike[UntypedColumn] { def cast[A: ColumnTyper]: Column[A] + def orElse(that: UntypedColumn): UntypedColumn = (this, that) match { + case (EmptyUntypedColumn, _) => that + case (_, EmptyUntypedColumn) => this + case _ => MergedUntypedColumn(this, that) + } } object UntypedColumn { implicit object monoid extends Monoid[UntypedColumn] { def id: UntypedColumn = empty - def op(lhs: 
UntypedColumn, rhs: UntypedColumn): UntypedColumn = (lhs, rhs) match { - case (EmptyUntypedColumn, _) => rhs - case (_, EmptyUntypedColumn) => lhs - case _ => MergedUntypedColumn(lhs, rhs) - } + def op(lhs: UntypedColumn, rhs: UntypedColumn): UntypedColumn = + lhs orElse rhs } final def empty: UntypedColumn = EmptyUntypedColumn diff --git a/framian/src/main/scala/framian/csv/Csv.scala b/framian/src/main/scala/framian/csv/Csv.scala new file mode 100644 index 0000000..8971275 --- /dev/null +++ b/framian/src/main/scala/framian/csv/Csv.scala @@ -0,0 +1,93 @@ +package framian +package csv + +import spire.std.int._ +import spire.std.string._ + +final case class Csv(format: CsvFormat, header: Option[Vector[String]], rows: Vector[Either[CsvError, CsvRow]]) { + def toFrame: Frame[Int, Int] = { + val validRows = rows.collect { case Right(row) => row } + val cols = validRows.foldLeft(Map.empty[Int,(ColumnBuilder[BigDecimal],ColumnBuilder[String])]) { (acc0, row) => + row.cells.zipWithIndex.foldLeft(acc0) { case (acc, (cell, colIdx)) => + val (numCol, strCol) = acc.getOrElse(colIdx, (new ColumnBuilder[BigDecimal], new ColumnBuilder[String])) + cell match { + case CsvCell.Data(value) => + numCol += scala.util.Try(BigDecimal(value)).map(Value(_)).getOrElse(NA) + strCol.addValue(value) + case CsvCell.Empty => + numCol.addNA() + strCol.addNA() + case CsvCell.Invalid => + numCol.addNM() + strCol.addNM() + } + acc + (colIdx -> (numCol, strCol)) + } + } + + val columns = Column.fromMap(cols.map { case (col, (numCol, strCol)) => + col -> (TypedColumn(numCol.result()) orElse TypedColumn(strCol.result())) + }) + + Frame.fromColumns( + Index(Array.range(0, validRows.size)), + Index(Array.range(0, cols.size)), + columns + ) + } + + def toLabeledFrame: Frame[Int, String] = header map { header => + toFrame.withColIndex(Index.fromKeys(header: _*)) + } getOrElse ??? 
+ + override def toString: String = { + val full = header filter (_ => format.header) map { headings => + Right(CsvRow(headings map (CsvCell.Data(_)))) +: rows + } getOrElse rows + + full.iterator. + collect { case Right(row) => row }. + map(_ render format). + mkString(format.rowDelim.value) + } +} + +object Csv { + val BufferSize = 32 * 1024 + + def empty(format: CsvFormat): Csv = Csv(format, None, Vector.empty) + + def fromFrame[Col](format: CsvFormat)(frame: Frame[_, Col]): Csv = { + val header = if (format.header) { + Some(frame.colIndex.toVector map (_._1.toString)) + } else { + None + } + val rows = frame.get(Cols.all[Col].as[CsvRow]).denseIterator.map { + case (_, row) => Right(row) + }.toVector + Csv(format, header, rows) + } + + import java.nio.charset.{ Charset, StandardCharsets } + import java.io.File + import java.io.{ InputStream, FileInputStream } + import java.io.{ Reader, InputStreamReader } + + def parseString(input: String): Csv = + CsvParser(CsvFormat.guess(input)).parseString(input) + + def parseReader(reader: Reader): Csv = { + val (format, reader0) = CsvFormat.guess(reader) + CsvParser(format).parseReader(reader0) + } + + def parseInputStream(is: InputStream, charset: Charset = StandardCharsets.UTF_8): Csv = + parseReader(new InputStreamReader(is, charset)) + + def parseFile(file: File, charset: Charset = StandardCharsets.UTF_8): Csv = + parseInputStream(new FileInputStream(file), charset) + + def parse(filename: String, charset: Charset = StandardCharsets.UTF_8): Csv = + parseFile(new File(filename), charset) +} diff --git a/framian/src/main/scala/framian/csv/CsvCell.scala b/framian/src/main/scala/framian/csv/CsvCell.scala new file mode 100644 index 0000000..50559c4 --- /dev/null +++ b/framian/src/main/scala/framian/csv/CsvCell.scala @@ -0,0 +1,37 @@ +package framian +package csv + +import spire.syntax.monoid._ + +sealed abstract class CsvCell { + def render(format: CsvFormat): String +} + +object CsvCell { + case class Data(value: String) 
extends CsvCell { + def render(format: CsvFormat): String = format.render(value) + override def toString: String = value + } + case object Empty extends CsvCell { + def render(format: CsvFormat): String = format.empty + override def toString: String = "-" + } + case object Invalid extends CsvCell { + def render(format: CsvFormat): String = format.invalid + override def toString: String = "" + } + + def fromNonValue(nonValue: NonValue): CsvCell = nonValue match { + case NA => Empty + case NM => Invalid + } + + implicit object CsvCellColumnTyper extends ColumnTyper[CsvCell] { + def cast(col: TypedColumn[_]): Column[CsvCell] = { + val num = col.cast[BigDecimal] map (n => Data(n.toString): CsvCell) + val text = col.cast[String] map (Data(_): CsvCell) + val any = col.cast[Any] map (any => Data(any.toString): CsvCell) + num |+| text |+| any + } + } +} diff --git a/framian/src/main/scala/framian/csv/CsvFormat.scala b/framian/src/main/scala/framian/csv/CsvFormat.scala index c18f2ce..5999af4 100644 --- a/framian/src/main/scala/framian/csv/CsvFormat.scala +++ b/framian/src/main/scala/framian/csv/CsvFormat.scala @@ -15,7 +15,7 @@ case class CsvFormat( quoteEscape: String = "\"", empty: String = "NA", invalid: String = "NM", - header: Boolean = true, + header: Boolean = false, rowDelim: CsvRowDelim = CsvRowDelim.Both ) { val escapedQuote = quoteEscape + quote @@ -41,8 +41,6 @@ case class CsvFormat( } object CsvFormat { - val BufferSize = 32 * 1024 - val CSV = CsvFormat(",") val TSV = CsvFormat("\t") @@ -53,8 +51,8 @@ object CsvFormat { * returned reader will contain all the original reader's data. 
*/ def guess(reader: Reader): (CsvFormat, Reader) = { - val reader0 = new PushbackReader(reader, BufferSize) - val buffer = new Array[Char](BufferSize) + val reader0 = new PushbackReader(reader, Csv.BufferSize) + val buffer = new Array[Char](Csv.BufferSize) val len = reader0.read(buffer) reader0.unread(buffer, 0, len) @@ -103,9 +101,25 @@ object CsvFormat { ) val quote = choose("\"" -> 1.2, "\'" -> 1) val quoteEscape = choose(s"$quote$quote" -> 1.1, s"\\$quote" -> 1).dropRight(quote.length) - val empty = choose("-" -> 1.5, "N/A" -> 2, "NA" -> 1) + val empty = choose("?" -> 1.5, "-" -> 1.5, "N/A" -> 2, "NA" -> 1) val invalid = choose("N/M" -> 2, "NaN" -> 1) - CsvFormat(separator, quote, quoteEscape, empty, invalid, true, rowDelim) + val headerEnd = str.indexOf(rowDelim.value) + val header = if (headerEnd > 0) { + import spire.std.map._ + import spire.std.double._ + import spire.syntax.all._ + + val (hdr, rows) = str.replace(separator, "").splitAt(headerEnd) + def mkVec(s: String): Map[Char, Double] = + s.groupBy(c => c).map { case (k, v) => k -> v.length.toDouble }.normalize + def similarity[K](x: Map[K, Double], y: Map[K, Double]): Double = (x dot y) / (x.norm * y.norm) + + similarity(mkVec(hdr), mkVec(rows)) < 0.5 + } else { + false + } + + CsvFormat(separator, quote, quoteEscape, empty, invalid, header, rowDelim) } } diff --git a/framian/src/main/scala/framian/csv/CsvParser.scala b/framian/src/main/scala/framian/csv/CsvParser.scala index 9ab24c5..7ab9ebe 100644 --- a/framian/src/main/scala/framian/csv/CsvParser.scala +++ b/framian/src/main/scala/framian/csv/CsvParser.scala @@ -5,24 +5,6 @@ import java.io.File import java.io.{ InputStream, FileInputStream } import java.io.{ Reader, InputStreamReader } -// case class Csv(format: CsvFormat, header: Option[CsvRow], data: Vector[CsvRow]) { -// } -// -// sealed trait CsvCell { -// def value: String -// def isNumeric: Boolean -// def toDouble: Option[Double] -// def toBigDecimal: Option[Double] -// } -// -// object 
CsvCell { -// case class Unquoted(value: String) extends CsvCell -// case class Quoted(value: String) extends CsvCell -// case object Empty extends CsvCell("") -// case object Invalid extends CsvCell("") -// } -// - case class CsvParser(format: CsvFormat) { import ParserState._ import Instr._ @@ -32,8 +14,8 @@ case class CsvParser(format: CsvFormat) { CsvError(msg, pos, context, row, pos - s0.input.mark + 1) } - def parseResource[A](a: A, close: A => Unit)(read: A => Option[String]): Vector[Either[CsvError, Vector[String]]] = { - def loop(s0: ParserState, row: Long, acc: Vector[Either[CsvError, Vector[String]]]): Vector[Either[CsvError, Vector[String]]] = { + def parseResource[A](a: A, close: A => Unit)(read: A => Option[String]): Csv = { + def loop(s0: ParserState, row: Long, acc: Vector[Either[CsvError, CsvRow]]): Csv = { val (s1, instr) = parse(s0) instr match { @@ -51,7 +33,13 @@ case class CsvParser(format: CsvFormat) { loop(s1.mapInput(_.finished), row, acc) } case Done => - acc + val (hdr, rows) = if (format.header) { + acc.headOption match { + case Some(Right(row)) => (Some(row.text(format)), acc.tail) + case _ => (None, acc) + } + } else (None, acc) + Csv(format, hdr, rows) } } @@ -59,7 +47,7 @@ case class CsvParser(format: CsvFormat) { read(a).map { input0 => loop(ParseRow(Input.init(input0)), 1L, Vector.empty) }.getOrElse { - Vector.empty + Csv.empty(format) } } finally { try { @@ -70,8 +58,8 @@ case class CsvParser(format: CsvFormat) { } } - def parseReader(reader: Reader): Vector[Either[CsvError, Vector[String]]] = { - val buffer = new Array[Char](CsvParser.BufferSize) + def parseReader(reader: Reader): Csv = { + val buffer = new Array[Char](Csv.BufferSize) parseResource[Reader](reader, _.close()) { reader => val len = reader.read(buffer) if (len >= 0) { @@ -82,20 +70,20 @@ case class CsvParser(format: CsvFormat) { } } - def parseInputStream(is: InputStream, charset: Charset = StandardCharsets.UTF_8): Vector[Either[CsvError, Vector[String]]] = + def 
parseInputStream(is: InputStream, charset: Charset = StandardCharsets.UTF_8): Csv = parseReader(new InputStreamReader(is, charset)) - def parseFile(file: File, charset: Charset = StandardCharsets.UTF_8): Vector[Either[CsvError, Vector[String]]] = + def parseFile(file: File, charset: Charset = StandardCharsets.UTF_8): Csv = parseInputStream(new FileInputStream(file), charset) - def parseString(input: String): Vector[Either[CsvError, Vector[String]]] = { + def parseString(input: String): Csv = { var next: Option[String] = Some(input) parseResource[Unit]((), _ => ()) { _ => val chunk = next; next = None; chunk } } - private def parse(state: ParserState): (ParserState, Instr[Vector[String]]) = { + private def parse(state: ParserState): (ParserState, Instr[CsvRow]) = { import format._ val input: Input = state.input @@ -150,12 +138,17 @@ case class CsvParser(format: CsvFormat) { } } - def unquotedCell(): ParseResult[String] = { + def unquotedCell(): ParseResult[CsvCell] = { val start = pos - def loop(): ParseResult[String] = { + def loop(): ParseResult[CsvCell] = { val flag = isEndOfCell() if (flag > 0 || endOfFile) { - Emit(input.substring(start, pos)) + val value = input.substring(start, pos) + val csvCell = + if (value == empty) CsvCell.Empty + else if (value == invalid) CsvCell.Invalid + else CsvCell.Data(value) + Emit(csvCell) } else if (flag == 0) { advance() loop() @@ -167,9 +160,9 @@ case class CsvParser(format: CsvFormat) { loop() } - def quotedCell(): ParseResult[String] = { + def quotedCell(): ParseResult[CsvCell] = { val start = pos - def loop(): ParseResult[String] = { + def loop(): ParseResult[CsvCell] = { if (endOfInput) { if (endOfFile) { Fail("Unmatched quoted string at end of file", pos) @@ -189,9 +182,9 @@ case class CsvParser(format: CsvFormat) { advance(e) loop() } else if (q > 0) { - val escaped = input.substring(start, pos) + val escaped = input.substring(start, pos).replace(escapedQuote, quote) advance(q) - Emit(escaped) + 
Emit(CsvCell.Data(escaped)) } else { advance(1) loop() @@ -202,7 +195,7 @@ case class CsvParser(format: CsvFormat) { loop() } - def cell(): ParseResult[String] = { + def cell(): ParseResult[CsvCell] = { val q = isQuote() if (q == 0) { unquotedCell() @@ -229,7 +222,7 @@ case class CsvParser(format: CsvFormat) { } } - def row(cells: Vector[String]): (ParserState, Instr[Vector[String]]) = { + def row(cells: Vector[CsvCell]): (ParserState, Instr[CsvRow]) = { val start = pos def needInput() = (ContinueRow(cells, input.marked(start)), NeedInput) @@ -238,7 +231,7 @@ case class CsvParser(format: CsvFormat) { val r = isRowDelim() if (r > 0 || endOfFile) { advance(r) - (ParseRow(input.marked(pos)), Emit(cells)) + (ParseRow(input.marked(pos)), Emit(new CsvRow(cells))) } else if (r == 0) { (SkipRow(input.marked(pos)), Fail("Expected separator, row delimiter, or end of file", pos)) } else { @@ -247,8 +240,8 @@ case class CsvParser(format: CsvFormat) { } else if (s > 0) { advance(s) cell() match { - case Emit(str) => - row(cells :+ str) + case Emit(c) => + row(cells :+ c) case f @ Fail(_, _) => (SkipRow(input.marked(pos)), f) case NeedInput => @@ -286,21 +279,3 @@ case class CsvParser(format: CsvFormat) { } } } - -object CsvParser { - val BufferSize = 32 * 1024 - - def parseString(input: String): Vector[Either[CsvError, Vector[String]]] = - CsvParser(CsvFormat.guess(input)).parseString(input) - - def parseReader(reader: Reader): Vector[Either[CsvError, Vector[String]]] = { - val (format, reader0) = CsvFormat.guess(reader) - CsvParser(format).parseReader(reader0) - } - - def parseInputStream(is: InputStream, charset: Charset = StandardCharsets.UTF_8): Vector[Either[CsvError, Vector[String]]] = - parseReader(new InputStreamReader(is, charset)) - - def parseFile(file: File, charset: Charset = StandardCharsets.UTF_8): Vector[Either[CsvError, Vector[String]]] = - parseInputStream(new FileInputStream(file), charset) -} diff --git a/framian/src/main/scala/framian/csv/CsvRow.scala 
b/framian/src/main/scala/framian/csv/CsvRow.scala new file mode 100644 index 0000000..ecee77d --- /dev/null +++ b/framian/src/main/scala/framian/csv/CsvRow.scala @@ -0,0 +1,24 @@ +package framian +package csv + +/** + * A single row in a CSV file. + */ +final class CsvRow(val cells: Vector[CsvCell]) extends AnyVal { + def text(format: CsvFormat): Vector[String] = cells.map(_ render format) + + def render(format: CsvFormat): String = + cells.iterator map (_ render format) mkString format.separator + + override def toString: String = + cells.mkString("CsvRow(", ", ", ")") +} + +object CsvRow extends (Vector[CsvCell] => CsvRow) { + def apply(cells: Vector[CsvCell]): CsvRow = new CsvRow(cells) + + implicit def csvRowExtractor[Col]: RowExtractor[CsvRow, Col, Variable] = + RowExtractor.collectionOf[Vector, CsvCell, Col].map { cells => + CsvRow(cells.map(_.fold[CsvCell](CsvCell.Empty, CsvCell.Invalid)(cell => cell))) + } +} diff --git a/framian/src/main/scala/framian/csv/CsvRowExtractor.scala b/framian/src/main/scala/framian/csv/CsvRowExtractor.scala index 67afc65..b4ceb6a 100644 --- a/framian/src/main/scala/framian/csv/CsvRowExtractor.scala +++ b/framian/src/main/scala/framian/csv/CsvRowExtractor.scala @@ -23,66 +23,3 @@ package framian package csv import spire.syntax.monoid._ -import org.joda.time._ - -sealed abstract class CsvCell(val render: CsvFormat => String) - -object CsvCell { - case class Number(num: BigDecimal) extends CsvCell(_ render num.toString) - case class Text(value: String) extends CsvCell(_ render value) - case object Empty extends CsvCell(_.empty) - case object Invalid extends CsvCell(_.invalid) - - def fromNonValue(nonValue: NonValue): CsvCell = nonValue match { - case NA => Empty - case NM => Invalid - } - - implicit object CsvCellColumnTyper extends ColumnTyper[CsvCell] { - def cast(col: TypedColumn[_]): Column[CsvCell] = { - val num = col.cast[BigDecimal] map (Number(_): CsvCell) - val text = col.cast[String] map (Text(_): CsvCell) - val date = 
col.cast[LocalDate] map { date: LocalDate => Text(date.toString): CsvCell } - num |+| text |+| date - } - } -} - -/** - * A single row in a CSV file. - */ -final class CsvRow(val cells: List[CsvCell]) extends AnyVal { - def render(format: CsvFormat): String = - cells.iterator map (_ render format) mkString format.separator -} - -object CsvRow { - def apply(cells: List[CsvCell]): CsvRow = new CsvRow(cells) - - implicit object CsvRowExtractor extends RowExtractor[CsvRow, String, Variable] { - type P = List[Column[CsvCell]] - def prepare[Row](frame: Frame[Row, String], cols: List[String]): Option[List[Column[CsvCell]]] = - Some(cols map { key => frame.column[CsvCell](key)(CsvCell.CsvCellColumnTyper).column }) - def extract[Row](frame: Frame[Row, String], key: Row, row: Int, cols: List[Column[CsvCell]]): Cell[CsvRow] = - Value(CsvRow(cols map { _.foldRow(row)(a => a, CsvCell.fromNonValue) })) - } -} - -final case class Csv(header: Option[List[String]], rows: List[CsvRow]) { - def render(format: CsvFormat): String = { - val full = header filter (_ => format.header) map { headings => - CsvRow(headings map (CsvCell.Text(_))) :: rows - } getOrElse rows - full.iterator map (_ render format) mkString format.rowDelim.value - } -} - -object Csv { - def fromFrame(frame: Frame[_, String]): Csv = { - val header = frame.colIndex.toList map (_._1) - val rows = frame.get(Cols.all.as[CsvRow]).denseIterator.map { - case (_, row) => row - }.toList - Csv(Some(header), rows) - } -} diff --git a/framian/src/main/scala/framian/csv/ParserState.scala b/framian/src/main/scala/framian/csv/ParserState.scala index 9b8f29e..583fe96 100644 --- a/framian/src/main/scala/framian/csv/ParserState.scala +++ b/framian/src/main/scala/framian/csv/ParserState.scala @@ -15,7 +15,7 @@ sealed trait ParserState { } object ParserState { - case class ContinueRow(partial: Vector[String], input: Input) extends ParserState + case class ContinueRow(partial: Vector[CsvCell], input: Input) extends ParserState case 
class SkipRow(input: Input) extends ParserState case class ParseRow(input: Input) extends ParserState } From 784630a77c3883a41dcb9b6221148e03cb78ecce Mon Sep 17 00:00:00 2001 From: Tom Switzer Date: Thu, 24 Jul 2014 12:59:45 -0600 Subject: [PATCH 03/13] Add missing apply 3 case to AxisSelection. --- framian/src/main/scala/framian/AxisSelection.scala | 3 +++ 1 file changed, 3 insertions(+) diff --git a/framian/src/main/scala/framian/AxisSelection.scala b/framian/src/main/scala/framian/AxisSelection.scala index abc4436..cce879c 100644 --- a/framian/src/main/scala/framian/AxisSelection.scala +++ b/framian/src/main/scala/framian/AxisSelection.scala @@ -99,6 +99,9 @@ trait AxisSelectionCompanion[AxisSelection[K, A] <: AxisSelectionLike[K, A, Axis def apply[K](c0: K, c1: K): Pick[K, Fixed[_2], Rec[K]] = sized(Sized[List](c0, c1)) + def apply[K](c0: K, c1: K, c2: K): Pick[K, Fixed[_3], Rec[K]] = + sized(Sized[List](c0, c1, c2)) + def apply[K](c0: K, c1: K, c2: K, c3: K): Pick[K, Fixed[_4], Rec[K]] = sized(Sized[List](c0, c1, c2, c3)) From 212821490c2f7007c59d173d88c9928a012710e8 Mon Sep 17 00:00:00 2001 From: Tom Switzer Date: Fri, 25 Jul 2014 10:55:54 -0600 Subject: [PATCH 04/13] Better guesses at empty/invalid cells. 
--- framian/src/main/scala/framian/csv/Csv.scala | 29 +-- .../main/scala/framian/csv/CsvFormat.scala | 182 +++++++++++------- 2 files changed, 129 insertions(+), 82 deletions(-) diff --git a/framian/src/main/scala/framian/csv/Csv.scala b/framian/src/main/scala/framian/csv/Csv.scala index 8971275..906dae0 100644 --- a/framian/src/main/scala/framian/csv/Csv.scala +++ b/framian/src/main/scala/framian/csv/Csv.scala @@ -72,22 +72,25 @@ object Csv { import java.nio.charset.{ Charset, StandardCharsets } import java.io.File import java.io.{ InputStream, FileInputStream } - import java.io.{ Reader, InputStreamReader } + import java.io.{ Reader, InputStreamReader, StringReader } - def parseString(input: String): Csv = - CsvParser(CsvFormat.guess(input)).parseString(input) - - def parseReader(reader: Reader): Csv = { - val (format, reader0) = CsvFormat.guess(reader) - CsvParser(format).parseReader(reader0) + def parseReader(reader: Reader, format: CsvFormatStrategy = CsvFormat.Guess): Csv = { + val (format0, reader0) = format match { + case (guess: GuessCsvFormat) => guess(reader) + case (fmt: CsvFormat) => (fmt, reader) + } + CsvParser(format0).parseReader(reader0) } - def parseInputStream(is: InputStream, charset: Charset = StandardCharsets.UTF_8): Csv = - parseReader(new InputStreamReader(is, charset)) + def parseString(input: String, format: CsvFormatStrategy = CsvFormat.Guess): Csv = + parseReader(new StringReader(input), format) + + def parseInputStream(is: InputStream, format: CsvFormatStrategy = CsvFormat.Guess, charset: Charset = StandardCharsets.UTF_8): Csv = + parseReader(new InputStreamReader(is, charset), format) - def parseFile(file: File, charset: Charset = StandardCharsets.UTF_8): Csv = - parseInputStream(new FileInputStream(file), charset) + def parseFile(file: File, format: CsvFormatStrategy = CsvFormat.Guess, charset: Charset = StandardCharsets.UTF_8): Csv = + parseInputStream(new FileInputStream(file), format, charset) - def parse(filename: String, 
charset: Charset = StandardCharsets.UTF_8): Csv = - parseFile(new File(filename), charset) + def parsePath(filename: String, format: CsvFormatStrategy = CsvFormat.Guess, charset: Charset = StandardCharsets.UTF_8): Csv = + parseFile(new File(filename), format, charset) } diff --git a/framian/src/main/scala/framian/csv/CsvFormat.scala b/framian/src/main/scala/framian/csv/CsvFormat.scala index 5999af4..7f4e2e4 100644 --- a/framian/src/main/scala/framian/csv/CsvFormat.scala +++ b/framian/src/main/scala/framian/csv/CsvFormat.scala @@ -1,6 +1,7 @@ package framian.csv import java.io.{ Reader, PushbackReader } +import java.util.regex.Pattern sealed abstract class CsvRowDelim(val value: String, val alternate: Option[String] = None) object CsvRowDelim { @@ -9,17 +10,47 @@ object CsvRowDelim { case object Both extends CsvRowDelim("\n", Some("\r\n")) } +sealed trait CsvFormatStrategy + +trait GuessCsvFormat extends CsvFormatStrategy { + + /** + * Makes a guess at the format of the CSV accessed by `reader`. This returns + * the format, as well as the a new pushback reader to be used in place of + * `reader`. The original reader will have some data read out of it. The + * returned reader will contain all the original reader's data. + */ + def apply(reader: Reader): (CsvFormat, Reader) = { + val reader0 = new PushbackReader(reader, Csv.BufferSize) + val buffer = new Array[Char](Csv.BufferSize) + val len = reader0.read(buffer) + reader0.unread(buffer, 0, len) + + val chunk = new String(buffer, 0, len) + val format = apply(chunk) + (format, reader0) + } + + /** + * Given the first part of a CSV file, return a guess at the format. 
+ */ + def apply(str: String): CsvFormat +} + case class CsvFormat( separator: String, quote: String = "\"", quoteEscape: String = "\"", - empty: String = "NA", - invalid: String = "NM", + empty: String = "", + invalid: String = "", header: Boolean = false, rowDelim: CsvRowDelim = CsvRowDelim.Both -) { +) extends CsvFormatStrategy { val escapedQuote = quoteEscape + quote + override def toString: String = + s"""CsvFormat(separator = "$separator", quote = "$quote", quoteEscape = "$quoteEscape", empty = "$empty", invalid = "$invalid", header = $header, rowDelim = $rowDelim)""" + /** * Replaces all instances of \r\n with \n, then escapes all quotes and wraps * the string in quotes. @@ -44,82 +75,95 @@ object CsvFormat { val CSV = CsvFormat(",") val TSV = CsvFormat("\t") - /** - * Makes a guess at the format of the CSV accessed by `reader`. This returns - * the format, as well as the a new pushback reader to be used in place of - * `reader`. The original reader will have some data read out of it. The - * returned reader will contain all the original reader's data. - */ - def guess(reader: Reader): (CsvFormat, Reader) = { - val reader0 = new PushbackReader(reader, Csv.BufferSize) - val buffer = new Array[Char](Csv.BufferSize) - val len = reader0.read(buffer) - reader0.unread(buffer, 0, len) - - val chunk = new String(buffer, 0, len) - val format = guess(chunk) - (format, reader0) - } - - /** - * Performs a very naive guess of the CsvFormat. This uses weighted - * frequencies of occurences of common separators, row-delimiters, quotes, - * quote escapes, etc. and simply selects the max for each. 
- */ - def guess(str: String): CsvFormat = { - def count(ndl: String): Int = { - def check(i: Int, j: Int = 0): Boolean = - if (j >= ndl.length) true - else if (i < str.length && str.charAt(i) == ndl.charAt(j)) check(i + 1, j + 1) - else false - - def loop(i: Int, cnt: Int): Int = - if (i < str.length) { - loop(i + 1, if (check(i)) cnt + 1 else cnt) - } else cnt - - loop(0, 0) - } - - def choose(weightedOptions: (String, Double)*): String = { - val weights = Map(weightedOptions: _*) - val (best, weight) = weights.maxBy { case (c, w) => w * count(c) } - if (weight > 0) best else weights.maxBy(_._2)._1 + val Guess = Partial() + + case class Partial( + separator: Option[String] = None, + quote: Option[String] = None, + quoteEscape: Option[String] = None, + empty: Option[String] = None, + invalid: Option[String] = None, + header: Option[Boolean] = None, + rowDelim: Option[CsvRowDelim] = None + ) extends GuessCsvFormat { + + /** + * Performs a very naive guess of the CsvFormat. This uses weighted + * frequencies of occurences of common separators, row-delimiters, quotes, + * quote escapes, etc. and simply selects the max for each. 
+ */ + def apply(str: String): CsvFormat = { + def count(ndl: String): Int = { + def check(i: Int, j: Int = 0): Boolean = + if (j >= ndl.length) true + else if (i < str.length && str.charAt(i) == ndl.charAt(j)) check(i + 1, j + 1) + else false + + def loop(i: Int, cnt: Int): Int = + if (i < str.length) { + loop(i + 1, if (check(i)) cnt + 1 else cnt) + } else cnt + + loop(0, 0) + } + + def choose(weightedOptions: (String, Double)*)(f: String => Int): String = { + val weights = Map(weightedOptions: _*) + val (best, weight) = weights.maxBy { case (c, w) => w * f(c) } + if (weight > 0) best else weights.maxBy(_._2)._1 + } + + val rowDelim0 = rowDelim.getOrElse { + val windCnt = count("\r\n") + val unixCnt = count("\n") + + if ((windCnt < 4 * unixCnt) && (unixCnt < 4 * windCnt)) CsvRowDelim.Both + else if (windCnt < 4 * unixCnt) CsvRowDelim.Unix + else CsvRowDelim.Windows + } + val separator0 = separator.getOrElse { + choose("," -> 1.0, "\t" -> 3.0, ";" -> 2.0, "|" -> 3.0)(count) + } + val quote0 = quote.getOrElse(choose("\"" -> 1.2, "\'" -> 1)(count)) + val quoteEscape0 = choose(s"$quote0$quote0" -> 1.1, s"\\$quote0" -> 1)(count).dropRight(quote0.length) + + val cells = for { + row0 <- str.split(Pattern.quote(rowDelim0.value)) + row <- rowDelim0.alternate.fold(Array(row0)) { alt => + row0.split(Pattern.quote(alt)) + } + cell <- row.split(Pattern.quote(separator0)) + } yield cell + def matches(value: String): Int = cells.filter(_ == value).size + val empty0 = empty.getOrElse { + choose("" -> 3, "?" 
-> 2, "-" -> 2, "N/A" -> 1, "NA" -> 1)(matches) + } + val invalid0 = invalid.getOrElse { + if (matches("N/M") > 1) "N/M" else empty0 + } + + val header0 = header.getOrElse(hasHeader(str, rowDelim0.value, separator0, quote0)) + + CsvFormat(separator0, quote0, quoteEscape0, empty0, invalid0, header0, rowDelim0) } - val windCnt = count("\r\n") - val unixCnt = count("\n") - val rowDelim = - if ((windCnt < 4 * unixCnt) && (unixCnt < 4 * windCnt)) CsvRowDelim.Both - else if (windCnt < 4 * unixCnt) CsvRowDelim.Unix - else CsvRowDelim.Windows - val separator = choose( - "," -> 1.0, - "\t" -> 3.0, - ";" -> 2.0, - "|" -> 3.0 - ) - val quote = choose("\"" -> 1.2, "\'" -> 1) - val quoteEscape = choose(s"$quote$quote" -> 1.1, s"\\$quote" -> 1).dropRight(quote.length) - val empty = choose("?" -> 1.5, "-" -> 1.5, "N/A" -> 2, "NA" -> 1) - val invalid = choose("N/M" -> 2, "NaN" -> 1) - - val headerEnd = str.indexOf(rowDelim.value) - val header = if (headerEnd > 0) { + def hasHeader(chunk: String, rowDelim: String, separator: String, quote: String): Boolean = { import spire.std.map._ import spire.std.double._ import spire.syntax.all._ - val (hdr, rows) = str.replace(separator, "").splitAt(headerEnd) def mkVec(s: String): Map[Char, Double] = s.groupBy(c => c).map { case (k, v) => k -> v.length.toDouble }.normalize + def similarity[K](x: Map[K, Double], y: Map[K, Double]): Double = (x dot y) / (x.norm * y.norm) - similarity(mkVec(hdr), mkVec(rows)) < 0.5 - } else { - false + val headerEnd = chunk.indexOf(rowDelim) + if (headerEnd > 0) { + val (hdr, rows) = chunk.replace(separator, "").replace(quote, "").splitAt(headerEnd) + similarity(mkVec(hdr), mkVec(rows)) < 0.5 + } else { + false + } } - - CsvFormat(separator, quote, quoteEscape, empty, invalid, header, rowDelim) } } From 72f8d5b5133d632244e9e5aff0ad9d43ac9e48e8 Mon Sep 17 00:00:00 2001 From: Tom Switzer Date: Fri, 25 Jul 2014 11:56:39 -0600 Subject: [PATCH 05/13] Deal with labeled frames in a sane way. 
--- framian/src/main/scala/framian/csv/Csv.scala | 102 ++++++++++++------ .../main/scala/framian/csv/CsvFormat.scala | 13 ++- .../main/scala/framian/csv/CsvParser.scala | 9 +- 3 files changed, 84 insertions(+), 40 deletions(-) diff --git a/framian/src/main/scala/framian/csv/Csv.scala b/framian/src/main/scala/framian/csv/Csv.scala index 906dae0..fdd4fc7 100644 --- a/framian/src/main/scala/framian/csv/Csv.scala +++ b/framian/src/main/scala/framian/csv/Csv.scala @@ -4,10 +4,68 @@ package csv import spire.std.int._ import spire.std.string._ -final case class Csv(format: CsvFormat, header: Option[Vector[String]], rows: Vector[Either[CsvError, CsvRow]]) { - def toFrame: Frame[Int, Int] = { - val validRows = rows.collect { case Right(row) => row } - val cols = validRows.foldLeft(Map.empty[Int,(ColumnBuilder[BigDecimal],ColumnBuilder[String])]) { (acc0, row) => +sealed abstract class Csv { + val format: CsvFormat + val rows: Vector[Either[CsvError, CsvRow]] + + lazy val data: Vector[CsvRow] = + rows.collect { case Right(row) => row } + lazy val errors: Vector[CsvError] = + rows.collect { case Left(error) => error } + def hasErrors: Boolean = !errors.isEmpty + + def unlabeled: UnlabeledCsv = this match { + case csv @ UnlabeledCsv(_, _) => + csv + case LabeledCsv(format, _, rows) => + UnlabeledCsv(format.copy(header = false), rows) + } + + def labeled: LabeledCsv = this match { + case csv @ LabeledCsv(_, _, _) => + csv + case UnlabeledCsv(format, rows) => + val format0 = format.copy(header = true) + rows.headOption.flatMap(_.right.toOption).map { hdr => + LabeledCsv(format0, hdr.text(format), rows.tail) + }.getOrElse { + LabeledCsv(format0, Vector.empty, Vector.empty) + } + } + + override def toString: String = { + val full = this match { + case LabeledCsv(_, header, _) => + CsvRow(header map (CsvCell.Data(_))) +: data + case UnlabeledCsv(_, _) => + data + } + + full.iterator. + map(_ render format). 
+ mkString(format.rowDelim.value) + } +} + +case class LabeledCsv(format: CsvFormat, header: Vector[String], rows: Vector[Either[CsvError, CsvRow]]) extends Csv { + def toFrame: Frame[Int, String] = + Csv.toFrame(data).withColIndex(Index.fromKeys(header: _*)) +} + +case class UnlabeledCsv(format: CsvFormat, rows: Vector[Either[CsvError, CsvRow]]) extends Csv { + def toFrame: Frame[Int, Int] = + Csv.toFrame(data) +} + +object Csv { + val BufferSize = 32 * 1024 + + def empty(format: CsvFormat): Csv = + if (format.header) LabeledCsv(format, Vector.empty, Vector.empty) + else UnlabeledCsv(format, Vector.empty) + + private[csv] def toFrame(rows: Vector[CsvRow]): Frame[Int, Int] = { + val cols = rows.foldLeft(Map.empty[Int,(ColumnBuilder[BigDecimal],ColumnBuilder[String])]) { (acc0, row) => row.cells.zipWithIndex.foldLeft(acc0) { case (acc, (cell, colIdx)) => val (numCol, strCol) = acc.getOrElse(colIdx, (new ColumnBuilder[BigDecimal], new ColumnBuilder[String])) cell match { @@ -30,43 +88,23 @@ final case class Csv(format: CsvFormat, header: Option[Vector[String]], rows: Ve }) Frame.fromColumns( - Index(Array.range(0, validRows.size)), + Index(Array.range(0, rows.size)), Index(Array.range(0, cols.size)), columns ) } - def toLabeledFrame: Frame[Int, String] = header map { header => - toFrame.withColIndex(Index.fromKeys(header: _*)) - } getOrElse ??? - - override def toString: String = { - val full = header filter (_ => format.header) map { headings => - Right(CsvRow(headings map (CsvCell.Data(_)))) +: rows - } getOrElse rows - - full.iterator. - collect { case Right(row) => row }. - map(_ render format). 
- mkString(format.rowDelim.value) - } -} - -object Csv { - val BufferSize = 32 * 1024 - - def empty(format: CsvFormat): Csv = Csv(format, None, Vector.empty) - def fromFrame[Col](format: CsvFormat)(frame: Frame[_, Col]): Csv = { - val header = if (format.header) { - Some(frame.colIndex.toVector map (_._1.toString)) - } else { - None - } val rows = frame.get(Cols.all[Col].as[CsvRow]).denseIterator.map { case (_, row) => Right(row) }.toVector - Csv(format, header, rows) + + if (format.header) { + val header = frame.colIndex.toVector.map(_._1.toString) + LabeledCsv(format, header, rows) + } else { + UnlabeledCsv(format, rows) + } } import java.nio.charset.{ Charset, StandardCharsets } diff --git a/framian/src/main/scala/framian/csv/CsvFormat.scala b/framian/src/main/scala/framian/csv/CsvFormat.scala index 7f4e2e4..cf491d4 100644 --- a/framian/src/main/scala/framian/csv/CsvFormat.scala +++ b/framian/src/main/scala/framian/csv/CsvFormat.scala @@ -5,6 +5,7 @@ import java.util.regex.Pattern sealed abstract class CsvRowDelim(val value: String, val alternate: Option[String] = None) object CsvRowDelim { + case class Custom(delim: String) extends CsvRowDelim(delim) case object Unix extends CsvRowDelim("\n") case object Windows extends CsvRowDelim("\r\n") case object Both extends CsvRowDelim("\n", Some("\r\n")) @@ -75,7 +76,7 @@ object CsvFormat { val CSV = CsvFormat(",") val TSV = CsvFormat("\t") - val Guess = Partial() + val Guess = Partial(header = Some(false)) case class Partial( separator: Option[String] = None, @@ -87,6 +88,15 @@ object CsvFormat { rowDelim: Option[CsvRowDelim] = None ) extends GuessCsvFormat { + def withSeparator(separator: String): Partial = copy(separator = Some(separator)) + def withQuote(quote: String): Partial = copy(quote = Some(quote)) + def withQuoteEscape(quoteEscape: String): Partial = copy(quoteEscape = Some(quoteEscape)) + def withEmpty(empty: String): Partial = copy(empty = Some(empty)) + def withInvalid(invalid: String): Partial = 
copy(invalid = Some(invalid)) + def withHeader(header: Boolean): Partial = copy(header = Some(header)) + def withRowDelim(rowDelim: CsvRowDelim): Partial = copy(rowDelim = Some(rowDelim)) + def withRowDelim(rowDelim: String): Partial = copy(rowDelim = Some(CsvRowDelim.Custom(rowDelim))) + /** * Performs a very naive guess of the CsvFormat. This uses weighted * frequencies of occurences of common separators, row-delimiters, quotes, @@ -160,6 +170,7 @@ object CsvFormat { val headerEnd = chunk.indexOf(rowDelim) if (headerEnd > 0) { val (hdr, rows) = chunk.replace(separator, "").replace(quote, "").splitAt(headerEnd) + println(s"header = ${similarity(mkVec(hdr), mkVec(rows))}") similarity(mkVec(hdr), mkVec(rows)) < 0.5 } else { false diff --git a/framian/src/main/scala/framian/csv/CsvParser.scala b/framian/src/main/scala/framian/csv/CsvParser.scala index 7ab9ebe..56c4f2b 100644 --- a/framian/src/main/scala/framian/csv/CsvParser.scala +++ b/framian/src/main/scala/framian/csv/CsvParser.scala @@ -33,13 +33,8 @@ case class CsvParser(format: CsvFormat) { loop(s1.mapInput(_.finished), row, acc) } case Done => - val (hdr, rows) = if (format.header) { - acc.headOption match { - case Some(Right(row)) => (Some(row.text(format)), acc.tail) - case _ => (None, acc) - } - } else (None, acc) - Csv(format, hdr, rows) + val csv = UnlabeledCsv(format, acc) + if (format.header) csv.labeled else csv } } From cdb3f84d2521ce39878ec2d1be3b4d1f806cd104 Mon Sep 17 00:00:00 2001 From: Tom Switzer Date: Fri, 25 Jul 2014 15:25:01 -0600 Subject: [PATCH 06/13] Include whole row as context in CSV errors. 
--- .../src/main/scala/framian/csv/CsvError.scala | 11 ++- .../main/scala/framian/csv/CsvParser.scala | 77 +++++++++++-------- .../main/scala/framian/csv/ParserState.scala | 14 ++-- 3 files changed, 65 insertions(+), 37 deletions(-) diff --git a/framian/src/main/scala/framian/csv/CsvError.scala b/framian/src/main/scala/framian/csv/CsvError.scala index 4e92609..9333029 100644 --- a/framian/src/main/scala/framian/csv/CsvError.scala +++ b/framian/src/main/scala/framian/csv/CsvError.scala @@ -1,3 +1,12 @@ package framian.csv -case class CsvError(message: String, pos: Long, context: String, row: Long, col: Long) +case class CsvError(message: String, rowStart: Long, pos: Long, context: String, row: Long, col: Long) { + def description: String = { + val msg = s"Error parsing CSV row: $message" + val prefix = s"Row $row: " + val padLength = col.toInt - 1 + prefix.length + val pointer = (" " * padLength) + "^" + + s"$msg\n\n$prefix$context\n$pointer" + } +} diff --git a/framian/src/main/scala/framian/csv/CsvParser.scala b/framian/src/main/scala/framian/csv/CsvParser.scala index 56c4f2b..1a33574 100644 --- a/framian/src/main/scala/framian/csv/CsvParser.scala +++ b/framian/src/main/scala/framian/csv/CsvParser.scala @@ -9,29 +9,46 @@ case class CsvParser(format: CsvFormat) { import ParserState._ import Instr._ - def mkError(input: Input, s0: ParserState, s1: ParserState, row: Long, msg: String, pos: Long): CsvError = { - val context = input.substring(s0.input.mark, s1.input.mark) - CsvError(msg, pos, context, row, pos - s0.input.mark + 1) + private def removeRowDelim(context: String): String = { + def dropTail(tail: String): Option[String] = + if (context.endsWith(tail)) Some(context.dropRight(tail.length)) + else None + + dropTail(format.rowDelim.value). + orElse(format.rowDelim.alternate.flatMap(dropTail)). 
+ getOrElse(context) } def parseResource[A](a: A, close: A => Unit)(read: A => Option[String]): Csv = { - def loop(s0: ParserState, row: Long, acc: Vector[Either[CsvError, CsvRow]]): Csv = { + def loop(s0: ParserState, fail: Option[Fail], row: Long, acc: Vector[Either[CsvError, CsvRow]]): Csv = { val (s1, instr) = parse(s0) instr match { case Emit(cells) => - loop(s1, row + 1, acc :+ Right(cells)) - case Fail(msg, pos) => - loop(s1, row + 1, acc :+ Left(mkError(s1.input, s0, s1, row, msg, pos))) + loop(s1, fail, row + 1, acc :+ Right(cells)) + + case f @ Fail(_, _) => + loop(s1, Some(f), row, acc) + case Resume => - loop(s1, row, acc) + fail match { + case Some(Fail(msg, pos)) => + val context = removeRowDelim(s1.input.substring(s0.rowStart, s1.rowStart)) + val error = CsvError(msg, s0.rowStart, pos, context, row, pos - s0.rowStart + 1) + loop(s1, None, row + 1, acc :+ Left(error)) + + case None => + loop(s1, None, row, acc) + } + case NeedInput => read(a) match { case Some(chunk) => - loop(s1.mapInput(_.append(chunk)), row, acc) + loop(s1.mapInput(_.append(chunk)), fail, row, acc) case None => - loop(s1.mapInput(_.finished), row, acc) + loop(s1.mapInput(_.finished), fail, row, acc) } + case Done => val csv = UnlabeledCsv(format, acc) if (format.header) csv.labeled else csv @@ -40,7 +57,7 @@ case class CsvParser(format: CsvFormat) { try { read(a).map { input0 => - loop(ParseRow(Input.init(input0)), 1L, Vector.empty) + loop(ParseRow(0L, 0L, Input.init(input0)), None, 1L, Vector.empty) }.getOrElse { Csv.empty(format) } @@ -82,7 +99,7 @@ case class CsvParser(format: CsvFormat) { import format._ val input: Input = state.input - var pos: Long = input.mark + var pos: Long = state.readFrom def ch: Char = input.charAt(pos) def endOfInput: Boolean = pos >= input.length def endOfFile: Boolean = endOfInput && input.isLast @@ -204,12 +221,12 @@ case class CsvParser(format: CsvFormat) { def skipToNextRow(): Boolean = { val d = isRowDelim() - if (d == 0) { - advance(1) - 
skipToNextRow() - } else if (d > 0) { + if (d > 0 || endOfFile) { advance(d) true + } else if (d == 0) { + advance(1) + skipToNextRow() } else { if (input.isLast) advance(input.length - pos) @@ -217,18 +234,18 @@ case class CsvParser(format: CsvFormat) { } } - def row(cells: Vector[CsvCell]): (ParserState, Instr[CsvRow]) = { + def row(rowStart: Long, cells: Vector[CsvCell]): (ParserState, Instr[CsvRow]) = { val start = pos - def needInput() = (ContinueRow(cells, input.marked(start)), NeedInput) + def needInput() = (ContinueRow(rowStart, start, cells, input), NeedInput) val s = isSeparator() if (s == 0) { val r = isRowDelim() if (r > 0 || endOfFile) { advance(r) - (ParseRow(input.marked(pos)), Emit(new CsvRow(cells))) + (ParseRow(pos, pos, input.marked(pos)), Emit(new CsvRow(cells))) } else if (r == 0) { - (SkipRow(input.marked(pos)), Fail("Expected separator, row delimiter, or end of file", pos)) + (SkipRow(rowStart, pos, input), Fail("Expected separator, row delimiter, or end of file", pos)) } else { needInput() } @@ -236,9 +253,9 @@ case class CsvParser(format: CsvFormat) { advance(s) cell() match { case Emit(c) => - row(cells :+ c) + row(rowStart, cells :+ c) case f @ Fail(_, _) => - (SkipRow(input.marked(pos)), f) + (SkipRow(rowStart, pos, input), f) case NeedInput => needInput() } @@ -248,28 +265,28 @@ case class CsvParser(format: CsvFormat) { } state match { - case ContinueRow(partial, _) => - row(partial) + case ContinueRow(rowStart, readFrom, partial, _) => + row(rowStart, partial) - case instr @ ParseRow(_) => + case instr @ ParseRow(rowStart, readFrom, _) => if (endOfFile) { (instr, Done) } else { cell() match { case Emit(csvCell) => - row(Vector(csvCell)) + row(rowStart, Vector(csvCell)) case f @ Fail(_, _) => - (SkipRow(input.marked(pos)), f) + (SkipRow(rowStart, pos, input), f) case NeedInput => (instr, NeedInput) } } - case SkipRow(_) => + case SkipRow(rowStart, readFrom, _) => if (skipToNextRow()) { - (ParseRow(input.marked(pos)), Resume) + 
(ParseRow(pos, pos, input.marked(pos)), Resume) } else { - (SkipRow(input.marked(pos)), NeedInput) + (SkipRow(rowStart, pos, input), NeedInput) } } } diff --git a/framian/src/main/scala/framian/csv/ParserState.scala b/framian/src/main/scala/framian/csv/ParserState.scala index 583fe96..4b3449e 100644 --- a/framian/src/main/scala/framian/csv/ParserState.scala +++ b/framian/src/main/scala/framian/csv/ParserState.scala @@ -3,19 +3,21 @@ package framian.csv sealed trait ParserState { import ParserState._ + def rowStart: Long + def readFrom: Long def input: Input def withInput(input0: Input): ParserState = this match { - case ContinueRow(partial, _) => ContinueRow(partial, input0) - case SkipRow(_) => SkipRow(input0) - case ParseRow(_) => ParseRow(input0) + case ContinueRow(_, _, partial, _) => ContinueRow(rowStart, readFrom, partial, input0) + case SkipRow(_, _, _) => SkipRow(rowStart, readFrom, input0) + case ParseRow(_, _, _) => ParseRow(rowStart, readFrom, input0) } def mapInput(f: Input => Input): ParserState = withInput(f(input)) } object ParserState { - case class ContinueRow(partial: Vector[CsvCell], input: Input) extends ParserState - case class SkipRow(input: Input) extends ParserState - case class ParseRow(input: Input) extends ParserState + case class ContinueRow(rowStart: Long, readFrom: Long, partial: Vector[CsvCell], input: Input) extends ParserState + case class SkipRow(rowStart: Long, readFrom: Long, input: Input) extends ParserState + case class ParseRow(rowStart: Long, readFrom: Long, input: Input) extends ParserState } From fe49157f3813eeb169f13bd2879a3f5d0a18134a Mon Sep 17 00:00:00 2001 From: Tom Switzer Date: Fri, 25 Jul 2014 15:50:31 -0600 Subject: [PATCH 07/13] Optionally allow row-delimiters within quotes. 
--- .../main/scala/framian/csv/CsvFormat.scala | 23 +++++++++++++++++-- .../main/scala/framian/csv/CsvParser.scala | 2 +- 2 files changed, 22 insertions(+), 3 deletions(-) diff --git a/framian/src/main/scala/framian/csv/CsvFormat.scala b/framian/src/main/scala/framian/csv/CsvFormat.scala index cf491d4..c2fda0e 100644 --- a/framian/src/main/scala/framian/csv/CsvFormat.scala +++ b/framian/src/main/scala/framian/csv/CsvFormat.scala @@ -39,13 +39,31 @@ trait GuessCsvFormat extends CsvFormatStrategy { } case class CsvFormat( + /** The delimiter that separates fields within the rows. */ separator: String, + + /** The character/string that indicates the beginning/end of a quoted value. */ quote: String = "\"", + + /** The string that is used to escape a quote character, within a quote. */ quoteEscape: String = "\"", + + /** The value of an empty field (common values are - or ?). */ empty: String = "", + + /** The value of an invalid field. Empty values take precedence, so setting + * this to the same value as `empty` essentially disabled invalid values. */ invalid: String = "", + + /** Indicates whether or not the CSV's first row is actually a header. */ header: Boolean = false, - rowDelim: CsvRowDelim = CsvRowDelim.Both + + /** The delimiter used to separate row. */ + rowDelim: CsvRowDelim = CsvRowDelim.Both, + + /** If true, allow row delimiters within quotes, otherwise they are treated + * as an error. 
*/ + allowRowDelimInQuotes: Boolean = true ) extends CsvFormatStrategy { val escapedQuote = quoteEscape + quote @@ -85,7 +103,8 @@ object CsvFormat { empty: Option[String] = None, invalid: Option[String] = None, header: Option[Boolean] = None, - rowDelim: Option[CsvRowDelim] = None + rowDelim: Option[CsvRowDelim] = None, + allowRowDelimInQuotes: Boolean = true ) extends GuessCsvFormat { def withSeparator(separator: String): Partial = copy(separator = Some(separator)) diff --git a/framian/src/main/scala/framian/csv/CsvParser.scala b/framian/src/main/scala/framian/csv/CsvParser.scala index 1a33574..f355a2a 100644 --- a/framian/src/main/scala/framian/csv/CsvParser.scala +++ b/framian/src/main/scala/framian/csv/CsvParser.scala @@ -182,7 +182,7 @@ case class CsvParser(format: CsvFormat) { NeedInput } } else { - val d = isRowDelim() + val d = if (allowRowDelimInQuotes) 0 else isRowDelim() val e = isEscapedQuote() val q = isQuote() From a1b9ea96a489378e7a34e757265d2008c1182e0a Mon Sep 17 00:00:00 2001 From: Tom Switzer Date: Fri, 25 Jul 2014 15:58:46 -0600 Subject: [PATCH 08/13] Comments. --- .../main/scala/framian/csv/CsvFormat.scala | 20 +++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) diff --git a/framian/src/main/scala/framian/csv/CsvFormat.scala b/framian/src/main/scala/framian/csv/CsvFormat.scala index c2fda0e..d83b72b 100644 --- a/framian/src/main/scala/framian/csv/CsvFormat.scala +++ b/framian/src/main/scala/framian/csv/CsvFormat.scala @@ -119,7 +119,23 @@ object CsvFormat { /** * Performs a very naive guess of the CsvFormat. This uses weighted * frequencies of occurences of common separators, row-delimiters, quotes, - * quote escapes, etc. and simply selects the max for each. + * quote escapes, etc. and simply selects the max for each. For empty + * values, it uses the frequency of the the possible empty values within + * the cells. 
+ * + * This supports: + * + * * \r\n and \n as row delimiters, + * * ',', '\t', ';', and '|' as field delimiters, + * * '"', and ''' as quote delimiter, + * * the quote delimiter or \ for quote escapes, + * * '', '?', '-', 'N/A', and 'NA' as empty values, and + * * 'N/M' and 'NM' as invalid values. + * + * Headers are guessed by using the cosine similarity of the frequency of + * characters (except quotes/field delimiters) between the first row and + * all subsequent rows. Values below 0.5 will result in a header being + * inferred. */ def apply(str: String): CsvFormat = { def count(ndl: String): Int = { @@ -173,7 +189,7 @@ object CsvFormat { val header0 = header.getOrElse(hasHeader(str, rowDelim0.value, separator0, quote0)) - CsvFormat(separator0, quote0, quoteEscape0, empty0, invalid0, header0, rowDelim0) + CsvFormat(separator0, quote0, quoteEscape0, empty0, invalid0, header0, rowDelim0, allowRowDelimInQuotes) } def hasHeader(chunk: String, rowDelim: String, separator: String, quote: String): Boolean = { From 18ef00ff314a4ca91d9a24b6c9a517445635e363 Mon Sep 17 00:00:00 2001 From: Tom Switzer Date: Sat, 26 Jul 2014 09:41:55 -0600 Subject: [PATCH 09/13] Remove String => numeric cast. This was added to support converting data loaded from a CSV as Strings to numeric data. However, the new CSV parser just creates 2 columns, one for numeric data and one for textual data, then `orElse`s them together. 
--- framian/src/main/scala/framian/NumericColumnTyper.scala | 2 -- 1 file changed, 2 deletions(-) diff --git a/framian/src/main/scala/framian/NumericColumnTyper.scala b/framian/src/main/scala/framian/NumericColumnTyper.scala index 0d7d6a4..3bea73b 100644 --- a/framian/src/main/scala/framian/NumericColumnTyper.scala +++ b/framian/src/main/scala/framian/NumericColumnTyper.scala @@ -115,7 +115,6 @@ object NumericColumnTyper { if (x.isExact) rational(x.toRational) else bigFloat(x.toBigDecimal) } - case (x: String) => string(x) case _ => z } } @@ -142,7 +141,6 @@ object NumericColumnTyper { case Classes.BigDecimal => bigFloat(column.asInstanceOf[Column[BigDecimal]]) case Classes.JavaBigDecimal => bigFloat(column.asInstanceOf[Column[java.math.BigDecimal]] map (BigDecimal(_))) case cls if Classes.Rational isAssignableFrom cls => rational(column.asInstanceOf[Column[Rational]]) - case Classes.String => string(column.asInstanceOf[Column[String]]) case _ => z } } From 5f7b5f287177a312247a0a53880f5597350fc33b Mon Sep 17 00:00:00 2001 From: Tom Switzer Date: Sun, 27 Jul 2014 12:58:07 -0600 Subject: [PATCH 10/13] Add more tests. 
--- .../main/scala/framian/csv/CsvFormat.scala | 24 ++- .../src/test/resources/csvs/auto-mpg-test.tsv | 10 +- framian/src/test/scala/framian/CsvSpec.scala | 155 +++++++++--------- 3 files changed, 101 insertions(+), 88 deletions(-) diff --git a/framian/src/main/scala/framian/csv/CsvFormat.scala b/framian/src/main/scala/framian/csv/CsvFormat.scala index d83b72b..6bbc8b7 100644 --- a/framian/src/main/scala/framian/csv/CsvFormat.scala +++ b/framian/src/main/scala/framian/csv/CsvFormat.scala @@ -11,7 +11,16 @@ object CsvRowDelim { case object Both extends CsvRowDelim("\n", Some("\r\n")) } -sealed trait CsvFormatStrategy +sealed trait CsvFormatStrategy { + def withSeparator(separator: String): CsvFormatStrategy + def withQuote(quote: String): CsvFormatStrategy + def withQuoteEscape(quoteEscape: String): CsvFormatStrategy + def withEmpty(empty: String): CsvFormatStrategy + def withInvalid(invalid: String): CsvFormatStrategy + def withHeader(header: Boolean): CsvFormatStrategy + def withRowDelim(rowDelim: CsvRowDelim): CsvFormatStrategy + def withRowDelim(rowDelim: String): CsvFormatStrategy +} trait GuessCsvFormat extends CsvFormatStrategy { @@ -68,7 +77,7 @@ case class CsvFormat( val escapedQuote = quoteEscape + quote override def toString: String = - s"""CsvFormat(separator = "$separator", quote = "$quote", quoteEscape = "$quoteEscape", empty = "$empty", invalid = "$invalid", header = $header, rowDelim = $rowDelim)""" + s"""CsvFormat(separator = "$separator", quote = "$quote", quoteEscape = "$quoteEscape", empty = "$empty", invalid = "$invalid", header = $header, rowDelim = $rowDelim, allowRowDelimInQuotes = $allowRowDelimInQuotes)""" /** * Replaces all instances of \r\n with \n, then escapes all quotes and wraps @@ -88,6 +97,15 @@ case class CsvFormat( (text contains quote)) escape(text) else text } + + def withSeparator(separator: String): CsvFormat = copy(separator = separator) + def withQuote(quote: String): CsvFormat = copy(quote = quote) + def 
withQuoteEscape(quoteEscape: String): CsvFormat = copy(quoteEscape = quoteEscape) + def withEmpty(empty: String): CsvFormat = copy(empty = empty) + def withInvalid(invalid: String): CsvFormat = copy(invalid = invalid) + def withHeader(header: Boolean): CsvFormat = copy(header = header) + def withRowDelim(rowDelim: CsvRowDelim): CsvFormat = copy(rowDelim = rowDelim) + def withRowDelim(rowDelim: String): CsvFormat = copy(rowDelim = CsvRowDelim.Custom(rowDelim)) } object CsvFormat { @@ -167,7 +185,7 @@ object CsvFormat { else CsvRowDelim.Windows } val separator0 = separator.getOrElse { - choose("," -> 1.0, "\t" -> 3.0, ";" -> 2.0, "|" -> 3.0)(count) + choose("," -> 2.0, "\t" -> 3.0, ";" -> 2.0, "|" -> 1.0)(count) } val quote0 = quote.getOrElse(choose("\"" -> 1.2, "\'" -> 1)(count)) val quoteEscape0 = choose(s"$quote0$quote0" -> 1.1, s"\\$quote0" -> 1)(count).dropRight(quote0.length) diff --git a/framian/src/test/resources/csvs/auto-mpg-test.tsv b/framian/src/test/resources/csvs/auto-mpg-test.tsv index 5c6fa70..903c077 100644 --- a/framian/src/test/resources/csvs/auto-mpg-test.tsv +++ b/framian/src/test/resources/csvs/auto-mpg-test.tsv @@ -1,5 +1,5 @@ -18.0 8 307.0 130.0 3504. 12.0 70 1 "chevrolet chevelle malibu" -15.0 8 350.0 165.0 3693. 11.5 70 1 "buick skylark 320" -18.0 8 318.0 150.0 3436. 11.0 70 1 "plymouth satellite" -16.0 8 304.0 150.0 3433. 12.0 70 1 "amc rebel sst" -17.0 8 302.0 140.0 3449. 10.5 70 1 "ford torino" +18.0 8 307.0 130.0 3504. 12.0 70 1 "chevrolet chevelle malibu" +15.0 8 350.0 165.0 3693. 11.5 70 1 "buick skylark 320" +18.0 8 318.0 150.0 3436. 11.0 70 1 "plymouth satellite" +16.0 8 304.0 150.0 3433. 12.0 70 1 "amc rebel sst" +17.0 8 302.0 140.0 3449. 
10.5 70 1 "ford torino" diff --git a/framian/src/test/scala/framian/CsvSpec.scala b/framian/src/test/scala/framian/CsvSpec.scala index b8d0dbb..4697a33 100644 --- a/framian/src/test/scala/framian/CsvSpec.scala +++ b/framian/src/test/scala/framian/CsvSpec.scala @@ -1,5 +1,5 @@ package framian -package utilities +package csv import org.specs2.mutable._ @@ -19,9 +19,9 @@ class CsvSpec extends Specification { val airPassengersBadComma = csvRoot +"AirPassengers-badcomma.csv" val autoMPG = csvRoot +"auto-mpg-test.tsv" - val defaultRowIndex = Index.fromKeys("0", "1", "2", "3", "4") - val withColumnRowIndex = Index.fromKeys("0", "1", "2", "3") - val defaultAPColumnIndex = Index.fromKeys("0", "1", "2") + val defaultRowIndex = Index.fromKeys(0, 1, 2, 3, 4) + val withColumnRowIndex = Index.fromKeys(0, 1, 2, 3) + val defaultAPColumnIndex = Index.fromKeys(0, 1, 2) val defaultAirPassengers = Frame.fromRows( "" :: "time" :: "AirPassengers" :: HNil, @@ -38,21 +38,6 @@ class CsvSpec extends Specification { "4" :: "1949.25" :: "129" :: HNil) .withColIndex(Index.fromKeys("", "time", "AirPassengers")) .withRowIndex(withColumnRowIndex) - val rowAirPassengers = (Frame.fromRows( - "time" :: "AirPassengers" :: HNil, - "1949" :: "112" :: HNil, - "1949.08333333333" :: "118" :: HNil, - "1949.16666666667" :: "132" :: HNil, - "1949.25" :: "129" :: HNil) - .withColIndex(Index.fromKeys("0", "1")) - .withRowIndex(Index.fromKeys("", "1", "2", "3", "4"))) - val correctAirPassengers = Frame.fromRows( - "1949" :: "112" :: HNil, - "1949.08333333333" :: "118" :: HNil, - "1949.16666666667" :: "132" :: HNil, - "1949.25" :: "129" :: HNil) - .withColIndex(Index.fromKeys("time", "AirPassengers")) - .withRowIndex(Index.fromKeys("1", "2", "3", "4")) val defaultMPG = Frame.fromRows( "18.0" :: "8" :: "307.0" :: "130.0" :: "3504." :: "12.0" :: "70" :: "1" :: "chevrolet chevelle malibu" :: HNil, @@ -61,86 +46,96 @@ class CsvSpec extends Specification { "16.0" :: "8" :: "304.0" :: "150.0" :: "3433." 
:: "12.0" :: "70" :: "1" :: "amc rebel sst" :: HNil, "17.0" :: "8" :: "302.0" :: "140.0" :: "3449." :: "10.5" :: "70" :: "1" :: "ford torino" :: HNil) .withRowIndex(defaultRowIndex) - .withColIndex(Index.fromKeys("0", "1", "2", "3", "4", "5", "6", "7", "8")) - val withRowIndexMPG = Frame.fromRows( - "18.0" :: "8" :: "307.0" :: "130.0" :: "3504." :: "12.0" :: "70" :: "1" :: HNil, - "15.0" :: "8" :: "350.0" :: "165.0" :: "3693." :: "11.5" :: "70" :: "1" :: HNil, - "18.0" :: "8" :: "318.0" :: "150.0" :: "3436." :: "11.0" :: "70" :: "1" :: HNil, - "16.0" :: "8" :: "304.0" :: "150.0" :: "3433." :: "12.0" :: "70" :: "1" :: HNil, - "17.0" :: "8" :: "302.0" :: "140.0" :: "3449." :: "10.5" :: "70" :: "1" :: HNil) - .withRowIndex(Index.fromKeys( - "chevrolet chevelle malibu", "buick skylark 320", - "plymouth satellite", "amc rebel sst", "ford torino")) - .withColIndex(Index.fromKeys("0", "1", "2", "3", "4", "5", "6", "7")) - val customColsMPG = Frame.fromRows( - "18.0" :: "8" :: "307.0" :: HNil, - "15.0" :: "8" :: "350.0" :: HNil, - "18.0" :: "8" :: "318.0" :: HNil, - "16.0" :: "8" :: "304.0" :: HNil, - "17.0" :: "8" :: "302.0" :: HNil) - .withRowIndex(defaultRowIndex) - .withColIndex(Index.fromKeys("0", "1", "2")) - val withRowIndexCustomColsMPG = Frame.fromRows( - "18.0" :: "8" :: "307.0" :: HNil, - "15.0" :: "8" :: "350.0" :: HNil, - "18.0" :: "8" :: "318.0" :: HNil, - "16.0" :: "8" :: "304.0" :: HNil, - "17.0" :: "8" :: "302.0" :: HNil) - .withRowIndex(Index.fromKeys( - "chevrolet chevelle malibu", "buick skylark 320", - "plymouth satellite", "amc rebel sst", "ford torino")) - .withColIndex(Index.fromKeys("0", "1", "2")) - - val apBadComma = Frame.fromRows( - "" :: "FlightName" :: "AirPassengers" :: HNil, - "1" :: "ABCD111" :: "112" :: HNil, - "2" :: "Delta20394" :: "118" :: HNil, - "3" :: "FLIGHTTOHELL, REALLY" :: "132" :: HNil, - "4" :: "United666" :: "129" :: HNil) - .withColIndex(defaultAPColumnIndex) - .withRowIndex(defaultRowIndex) + .withColIndex(Index.fromKeys(0, 
1, 2, 3, 4, 5, 6, 7, 8)) + + "CsvParser" should { + "parse air passengers as unlabeled CSV" in { + Csv.parsePath(airPassengers).unlabeled.toFrame must_== defaultAirPassengers + } - def getFile(loc:String) = new File(loc) + "parse air passengers as labeled CSV" in { + Csv.parsePath(airPassengers).labeled.toFrame must_== columnAirPassengers + } + + "parse autoMPG as unlabeled TSV" in { + Csv.parsePath(autoMPG).unlabeled.toFrame must_== defaultMPG + } - "Csv parser" should { - "parse air passengers with default settings" in { - loadFrameFromCSV(getFile(airPassengers)) must_== defaultAirPassengers - } + "parse CSV with separator in quote" in { + val data = """a,"b","c,d"|"e,f,g"""" + val csv = Csv.parseString(data, CsvFormat.Guess.withRowDelim("|")) + val frame = csv.unlabeled.toFrame + frame.getRow(0) must_== Some(Rec(0 -> "a", 1 -> "b", 2 -> "c,d")) + frame[String](1, 0) must_== Value("e,f,g") + frame[String](1, 1) must_== NA + frame[String](1, 2) must_== NA + } - "parse air passengers with just column headers" in { - loadFrameFromCSV(getFile(airPassengers), columnIndex = true) must_== columnAirPassengers + import CsvCell._ + + val TestFormat = CsvFormat( + separator = ",", + quote = "'", + quoteEscape = "'", + empty = "N/A", + invalid = "N/M", + header = false, + rowDelim = CsvRowDelim.Custom("|"), + allowRowDelimInQuotes = true + ) + + "parse escaped quotes" in { + Csv.parseString( + "a,'''','c'''|'''''d''''', ''''", + TestFormat + ).rows must_== Vector( + Right(CsvRow(Vector(Data("a"), Data("'"), Data("c'")))), + Right(CsvRow(Vector(Data("''d''"), Data(" ''''")))) + ) } - "parse air passengers with just row headers" in { - loadFrameFromCSV(getFile(airPassengers), columnIndex = false, rowIndex = 0) must_== rowAirPassengers + "respect CsvFormat separator" in { + Csv.parseString("a,b,c|d,e,f", TestFormat).rows must_== + Csv.parseString("a;b;c|d;e;f", TestFormat.withSeparator(";")).rows } - "parse air passengers with row and column headers" in { - 
loadFrameFromCSV(getFile(airPassengers), columnIndex = true, rowIndex = 0) must_== correctAirPassengers + "respect CsvFormat quote" in { + Csv.parseString("'a,b','b'|d,e", TestFormat).rows must_== + Csv.parseString("^a,b^,^b^|d,e", TestFormat.withQuote("^")).rows } - "fail to parse autoMPG with default settings" in { - loadFrameFromCSV(getFile(autoMPG)) must throwA[AssertionError] + "respect CsvFormat quote escape" in { + Csv.parseString("'a''b',''''|' '", TestFormat).rows must_== + Csv.parseString("'a\\'b','\\''|' '", TestFormat.withQuoteEscape("\\")).rows } - "parse autoMPG with delimiter as tab but otherwise default settings" in { - loadFrameFromCSV(getFile(autoMPG), delimiter = "\t") must_== defaultMPG + "respect CsvFormat empty" in { + Csv.parseString("a,N/A,b|N/A,N/A", TestFormat).rows must_== + Csv.parseString("a,,b|,", TestFormat.withEmpty("")).rows } - "parse autoMPG with delimiter as tab and row index as column 8" in { - loadFrameFromCSV(getFile(autoMPG), delimiter = "\t", rowIndex = 8) must_== withRowIndexMPG + "respect CsvFormat invalid" in { + Csv.parseString("a,N/M,b|N/M,N/M", TestFormat).rows must_== + Csv.parseString("a,nm,b|nm,nm", TestFormat.withInvalid("nm")).rows } - "parse autoMPG with delimiter as tab and just take first three columns" in { - loadFrameFromCSV(getFile(autoMPG), delimiter = "\t", columns = List(0, 1, 2)) must_== customColsMPG + "respect CsvFormat row delimiter" in { + Csv.parseString("a,b|c,d|e,f", TestFormat).rows must_== + Csv.parseString("a,b\nc,d\ne,f", TestFormat.withRowDelim(CsvRowDelim.Unix)).rows } - "parse autoMPG with delimiter as tab, column 8 as row index and just take first three columns" in { - loadFrameFromCSV(getFile(autoMPG), delimiter = "\t", rowIndex = 8, columns = List(0, 1, 2)) must_== withRowIndexCustomColsMPG + "parse CSV with row delimiter in quote" in { + Csv.parseString("a,'b|c'|'d|e',f", TestFormat).rows must_== Vector( + Right(CsvRow(Vector(Data("a"), Data("b|c")))), + Right(CsvRow(Vector(Data("d|e"), 
Data("f"))))) } - "parse a file that has the delimiter within a column within the chosen quote type" in { - loadFrameFromCSV(getFile(airPassengersBadComma)) must_== apBadComma + "parser respects whitespace" in { + val data = " a , , 'a','b'| b ,c , " + val csv = Csv.parseString(data, CsvFormat.Guess.withRowDelim("|")) + csv.rows must_== Vector( + Right(CsvRow(Vector(Data(" a "), Data(" "), Data(" 'a'"), Data("b")))), + Right(CsvRow(Vector(Data(" b "), Data("c "), Data(" "))))) } } } From 0125ba11f10163643d59aabef83100da9b5e7b34 Mon Sep 17 00:00:00 2001 From: Tom Switzer Date: Mon, 27 Oct 2014 16:03:23 -0400 Subject: [PATCH 11/13] Clean up after merge. --- framian/src/main/scala/framian/csv/Csv.scala | 2 +- framian/src/main/scala/framian/csv/CsvCell.scala | 2 +- framian/src/main/scala/framian/csv/CsvLoader.scala | 2 -- framian/src/main/scala/framian/csv/CsvRow.scala | 4 +++- 4 files changed, 5 insertions(+), 5 deletions(-) delete mode 100644 framian/src/main/scala/framian/csv/CsvLoader.scala diff --git a/framian/src/main/scala/framian/csv/Csv.scala b/framian/src/main/scala/framian/csv/Csv.scala index fdd4fc7..5dbb7fc 100644 --- a/framian/src/main/scala/framian/csv/Csv.scala +++ b/framian/src/main/scala/framian/csv/Csv.scala @@ -87,7 +87,7 @@ object Csv { col -> (TypedColumn(numCol.result()) orElse TypedColumn(strCol.result())) }) - Frame.fromColumns( + ColOrientedFrame( Index(Array.range(0, rows.size)), Index(Array.range(0, cols.size)), columns diff --git a/framian/src/main/scala/framian/csv/CsvCell.scala b/framian/src/main/scala/framian/csv/CsvCell.scala index 50559c4..4021930 100644 --- a/framian/src/main/scala/framian/csv/CsvCell.scala +++ b/framian/src/main/scala/framian/csv/CsvCell.scala @@ -26,7 +26,7 @@ object CsvCell { case NM => Invalid } - implicit object CsvCellColumnTyper extends ColumnTyper[CsvCell] { + implicit val CsvCellColumnTyper: ColumnTyper[CsvCell] = new ColumnTyper[CsvCell] { def cast(col: TypedColumn[_]): Column[CsvCell] = { val num = 
col.cast[BigDecimal] map (n => Data(n.toString): CsvCell) val text = col.cast[String] map (Data(_): CsvCell) diff --git a/framian/src/main/scala/framian/csv/CsvLoader.scala b/framian/src/main/scala/framian/csv/CsvLoader.scala deleted file mode 100644 index af90e6f..0000000 --- a/framian/src/main/scala/framian/csv/CsvLoader.scala +++ /dev/null @@ -1,2 +0,0 @@ -package framian.csv - diff --git a/framian/src/main/scala/framian/csv/CsvRow.scala b/framian/src/main/scala/framian/csv/CsvRow.scala index ecee77d..54c66ad 100644 --- a/framian/src/main/scala/framian/csv/CsvRow.scala +++ b/framian/src/main/scala/framian/csv/CsvRow.scala @@ -17,8 +17,10 @@ final class CsvRow(val cells: Vector[CsvCell]) extends AnyVal { object CsvRow extends (Vector[CsvCell] => CsvRow) { def apply(cells: Vector[CsvCell]): CsvRow = new CsvRow(cells) - implicit def csvRowExtractor[Col]: RowExtractor[CsvRow, Col, Variable] = + implicit def csvRowExtractor[Col]: RowExtractor[CsvRow, Col, Variable] = { + import CsvCell.CsvCellColumnTyper // TODO: WHY IS THIS NEEDED! ARGH! RowExtractor.collectionOf[Vector, CsvCell, Col].map { cells => CsvRow(cells.map(_.fold[CsvCell](CsvCell.Empty, CsvCell.Invalid)(cell => cell))) } + } } From 901639feeebe14c9d1bed1d80408708493c4dcc1 Mon Sep 17 00:00:00 2001 From: Tom Switzer Date: Mon, 27 Oct 2014 16:22:23 -0400 Subject: [PATCH 12/13] Remove obsolete loadFrameFromCSV util method. 
--- .../src/main/scala/framian/FrameUtils.scala | 175 ------------------ 1 file changed, 175 deletions(-) delete mode 100644 framian/src/main/scala/framian/FrameUtils.scala diff --git a/framian/src/main/scala/framian/FrameUtils.scala b/framian/src/main/scala/framian/FrameUtils.scala deleted file mode 100644 index b0e551b..0000000 --- a/framian/src/main/scala/framian/FrameUtils.scala +++ /dev/null @@ -1,175 +0,0 @@ -/* _____ _ - * | ___| __ __ _ _ __ ___ (_) __ _ _ __ - * | |_ | '__/ _` | '_ ` _ \| |/ _` | '_ \ - * | _|| | | (_| | | | | | | | (_| | | | | - * |_| |_| \__,_|_| |_| |_|_|\__,_|_| |_| - * - * Copyright 2014 Pellucid Analytics - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package framian - -import java.util.Scanner -import java.io._ - -import scala.collection.mutable.ArrayBuffer - -import scala.io.Source -import spire.math.Number -import spire.implicits._ - -package object utilities { - - /** Load a frame from CSV. - * - * @param delimiter how values in csv are delimited (tabs, commas, etc). defaults, per name of method, to comma. 
- * @param columnIndex whether or not the first row is expected to indicate the column index of the frame - * @param rowIndex whether or not the first column is expected to express the row index of the frame - */ - def loadFrameFromCSV( - csvFile: File, - delimiter: String = ",", - quote: String = "\"", - rowIndex: Int = -1, - columnIndex: Boolean = false, - columns: List[Int] = List() - ) = { - // sometimes you just want the bulk of a state-machine compiled for you... - val stripWhitespaceCheckQuoteState = - s"(?:[\\s]*($quote[^$quote]*$quote|[^\\s$quote]*(?:[\\s]*[^\\s$quote]*)*)[\\s]*)|[\\s]*($quote[^$quote]*)".r - val checkQuoteFinished = s"([^$quote]*)|([^$quote]*$quote)[\\s]*".r - - val stripQuotes = s"$quote?([^$quote]*)$quote?".r - - val file = new BufferedReader(new FileReader(csvFile)) - - var nextLine = file.readLine() - - // for now we're just making sure that the delimiter produces more than one column... - // otherwise, assuming misconfigured and not even trying to parse the file (ie comma on tsv) - assert(nextLine.split(delimiter).length > 1) - - // assuming sorted list of desired columns... 
- def parseLine(line: String, columns: List[Int] = List()): ArrayBuffer[String] = { - val lineScanner = new Scanner(line).useDelimiter(delimiter) - val results = ArrayBuffer[String]() - val columnsNotProvided = columns.isEmpty - - var inQuote = false - var position = 0 - var quoteBuilder: StringBuilder = null - var remainingColumns = columns - - def takeColumnIfRequested(value: String) = { - if (columnsNotProvided || remainingColumns.head == position) { - if (!columnsNotProvided) remainingColumns = remainingColumns.tail - val stripQuotes(cleanedValue) = value - results += cleanedValue - } - position += 1 - } - - while (lineScanner.hasNext && (columnsNotProvided || !remainingColumns.isEmpty)) { - val next = lineScanner.next() - if (inQuote) { - val checkQuoteFinished(middle, endOfQuote) = next - // either we're in the middle of a quote in which case add the middle - // to builder and move to the next segment of the lineScanner - if (middle != null) - quoteBuilder ++= delimiter + middle - // or we're at the end and need to add final value to quote builder and take column if needed - else { - quoteBuilder ++= delimiter + endOfQuote - takeColumnIfRequested(quoteBuilder.result) - inQuote = false - } - } else { - val stripWhitespaceCheckQuoteState(completeValue, quoteBeginning) = next - if (completeValue != null) - takeColumnIfRequested(completeValue) - else { - quoteBuilder = new StringBuilder(quoteBeginning) - inQuote = true - } - } - } - - results - } - - // if you want a row index, you don't have to explicitly specify the first column in columnsToParse - val columnsToParse = - if (rowIndex < 0) - columns - else { - if (!columns.isEmpty && !columns.contains(rowIndex)) - (rowIndex :: columns).sorted - else - columns - } - - // first line might be the column index and not real values, also want to instantiate column cache - val firstLine = parseLine(nextLine, columnsToParse) - val numberOfColumns = firstLine.length - // we either want to pull out that first row as the 
column index or produce a default integer index - val columnsSeq = 0 to (numberOfColumns - 1) map (_.toString) - val parsedColumns = ArrayBuffer[ArrayBuffer[String]](columnsSeq map { _ => ArrayBuffer[String]() }: _*) - - // need to make sure that we parse the first line if it isn't a column index. - val colIndexArray = - if (columnIndex) { - nextLine = file.readLine() - firstLine - } else - ArrayBuffer(columnsSeq: _*) - - var index = 0 - while (nextLine != null) { - val parsed = parseLine(nextLine, columnsToParse) - while (index < numberOfColumns) { - parsedColumns(index) += parsed(index) - index += 1 - } - - index = 0 - nextLine = file.readLine() - } - file.close() - - // either make a row index now that we know how many rows or grab the user specified row index column - // also, if there's a row index we need to drop first value in column index as un-needed. - val (rowIndexValues, valueColumns, valueColumnIndex) = - if (rowIndex < 0) - ((0 to parsedColumns(0).length - 1) map (_.toString), parsedColumns, colIndexArray) - else { - val rowIndexPosition = if (columnsToParse.isEmpty) rowIndex else columnsToParse.indexOf(rowIndex) - val (colIndexLeft, colIndexRight) = colIndexArray.splitAt(rowIndexPosition) - val (colsLeft, colsRight) = parsedColumns.splitAt(rowIndexPosition) - (parsedColumns(rowIndexPosition), - colsLeft ++ colsRight.drop(1), - // if column index, make sure row index name not a part of it - if (columnIndex) colIndexLeft ++ colIndexRight.drop(1) - // otherwise, make sure we get one fewer column index value than expected - else colIndexArray.slice(0, parsedColumns.length - 1)) - } - - ColOrientedFrame( - Index(rowIndexValues.toArray), - Series(valueColumnIndex.zip( - valueColumns map { - colArr => TypedColumn(Column.fromArray(colArr.toArray)) - }): _*)) - } -} From 6d86b792669f5344b795aceb3d1ce16827b65482 Mon Sep 17 00:00:00 2001 From: Tom Switzer Date: Wed, 29 Oct 2014 15:30:32 -0400 Subject: [PATCH 13/13] Fix bug in tests that only show up in 2.10. 
--- framian/src/test/scala/framian/CsvSpec.scala | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/framian/src/test/scala/framian/CsvSpec.scala b/framian/src/test/scala/framian/CsvSpec.scala index 56df661..b8db6b9 100644 --- a/framian/src/test/scala/framian/CsvSpec.scala +++ b/framian/src/test/scala/framian/CsvSpec.scala @@ -54,10 +54,10 @@ class CsvSpec extends Specification { Value("AirPassengers") ))))) val columnAirPassengers = Frame.fromRows( - 1 :: 1949 :: 112 :: HNil, - 2 :: 1949.08333333333 :: 118 :: HNil, - 3 :: 1949.16666666667 :: 132 :: HNil, - 4 :: 1949.25 :: 129 :: HNil) + 1 :: BigDecimal(1949) :: 112 :: HNil, + 2 :: BigDecimal(1949.08333333333) :: 118 :: HNil, + 3 :: BigDecimal(1949.16666666667) :: 132 :: HNil, + 4 :: BigDecimal(1949.25) :: 129 :: HNil) .withColIndex(Index.fromKeys("", "time", "AirPassengers")) .withRowIndex(withColumnRowIndex)