diff --git a/framian/src/main/scala/framian/FrameUtils.scala b/framian/src/main/scala/framian/FrameUtils.scala deleted file mode 100644 index 59f288c..0000000 --- a/framian/src/main/scala/framian/FrameUtils.scala +++ /dev/null @@ -1,177 +0,0 @@ -/* _____ _ - * | ___| __ __ _ _ __ ___ (_) __ _ _ __ - * | |_ | '__/ _` | '_ ` _ \| |/ _` | '_ \ - * | _|| | | (_| | | | | | | | (_| | | | | - * |_| |_| \__,_|_| |_| |_|_|\__,_|_| |_| - * - * Copyright 2014 Pellucid Analytics - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package framian - -import java.util.Scanner -import java.io._ - -import scala.collection.mutable.ArrayBuffer - -import scala.io.Source -import spire.math.Number -import spire.implicits._ - -import framian.column._ - -package object utilities { - - /** Load a frame from CSV. - * - * @param delimiter how values in csv are delimited (tabs, commas, etc). defaults, per name of method, to comma. - * @param columnIndex whether or not the first row is expected to indicate the column index of the frame - * @param rowIndex whether or not the first column is expected to express the row index of the frame - */ - def loadFrameFromCSV( - csvFile: File, - delimiter: String = ",", - quote: String = "\"", - rowIndex: Int = -1, - columnIndex: Boolean = false, - columns: List[Int] = List() - ) = { - // sometimes you just want the bulk of a state-machine compiled for you... 
- val stripWhitespaceCheckQuoteState = - s"(?:[\\s]*($quote[^$quote]*$quote|[^\\s$quote]*(?:[\\s]*[^\\s$quote]*)*)[\\s]*)|[\\s]*($quote[^$quote]*)".r - val checkQuoteFinished = s"([^$quote]*)|([^$quote]*$quote)[\\s]*".r - - val stripQuotes = s"$quote?([^$quote]*)$quote?".r - - val file = new BufferedReader(new FileReader(csvFile)) - - var nextLine = file.readLine() - - // for now we're just making sure that the delimiter produces more than one column... - // otherwise, assuming misconfigured and not even trying to parse the file (ie comma on tsv) - assert(nextLine.split(delimiter).length > 1) - - // assuming sorted list of desired columns... - def parseLine(line: String, columns: List[Int] = List()): ArrayBuffer[String] = { - val lineScanner = new Scanner(line).useDelimiter(delimiter) - val results = ArrayBuffer[String]() - val columnsNotProvided = columns.isEmpty - - var inQuote = false - var position = 0 - var quoteBuilder: StringBuilder = null - var remainingColumns = columns - - def takeColumnIfRequested(value: String) = { - if (columnsNotProvided || remainingColumns.head == position) { - if (!columnsNotProvided) remainingColumns = remainingColumns.tail - val stripQuotes(cleanedValue) = value - results += cleanedValue - } - position += 1 - } - - while (lineScanner.hasNext && (columnsNotProvided || !remainingColumns.isEmpty)) { - val next = lineScanner.next() - if (inQuote) { - val checkQuoteFinished(middle, endOfQuote) = next - // either we're in the middle of a quote in which case add the middle - // to builder and move to the next segment of the lineScanner - if (middle != null) - quoteBuilder ++= delimiter + middle - // or we're at the end and need to add final value to quote builder and take column if needed - else { - quoteBuilder ++= delimiter + endOfQuote - takeColumnIfRequested(quoteBuilder.result) - inQuote = false - } - } else { - val stripWhitespaceCheckQuoteState(completeValue, quoteBeginning) = next - if (completeValue != null) - 
takeColumnIfRequested(completeValue) - else { - quoteBuilder = new StringBuilder(quoteBeginning) - inQuote = true - } - } - } - - results - } - - // if you want a row index, you don't have to explicitly specify the first column in columnsToParse - val columnsToParse = - if (rowIndex < 0) - columns - else { - if (!columns.isEmpty && !columns.contains(rowIndex)) - (rowIndex :: columns).sorted - else - columns - } - - // first line might be the column index and not real values, also want to instantiate column cache - val firstLine = parseLine(nextLine, columnsToParse) - val numberOfColumns = firstLine.length - // we either want to pull out that first row as the column index or produce a default integer index - val columnsSeq = 0 to (numberOfColumns - 1) map (_.toString) - val parsedColumns = ArrayBuffer[ArrayBuffer[String]](columnsSeq map { _ => ArrayBuffer[String]() }: _*) - - // need to make sure that we parse the first line if it isn't a column index. - val colIndexArray = - if (columnIndex) { - nextLine = file.readLine() - firstLine - } else - ArrayBuffer(columnsSeq: _*) - - var index = 0 - while (nextLine != null) { - val parsed = parseLine(nextLine, columnsToParse) - while (index < numberOfColumns) { - parsedColumns(index) += parsed(index) - index += 1 - } - - index = 0 - nextLine = file.readLine() - } - file.close() - - // either make a row index now that we know how many rows or grab the user specified row index column - // also, if there's a row index we need to drop first value in column index as un-needed. 
- val (rowIndexValues, valueColumns, valueColumnIndex) = - if (rowIndex < 0) - ((0 to parsedColumns(0).length - 1) map (_.toString), parsedColumns, colIndexArray) - else { - val rowIndexPosition = if (columnsToParse.isEmpty) rowIndex else columnsToParse.indexOf(rowIndex) - val (colIndexLeft, colIndexRight) = colIndexArray.splitAt(rowIndexPosition) - val (colsLeft, colsRight) = parsedColumns.splitAt(rowIndexPosition) - (parsedColumns(rowIndexPosition), - colsLeft ++ colsRight.drop(1), - // if column index, make sure row index name not a part of it - if (columnIndex) colIndexLeft ++ colIndexRight.drop(1) - // otherwise, make sure we get one fewer column index value than expected - else colIndexArray.slice(0, parsedColumns.length - 1)) - } - - ColOrientedFrame( - Index(rowIndexValues.toArray), - Series(valueColumnIndex.zip( - valueColumns map { - colArr => TypedColumn(Column.dense(colArr.toArray)) - }): _*)) - } -} diff --git a/framian/src/main/scala/framian/NumericColumnTyper.scala b/framian/src/main/scala/framian/NumericColumnTyper.scala index 1e95d42..d0b0ad9 100644 --- a/framian/src/main/scala/framian/NumericColumnTyper.scala +++ b/framian/src/main/scala/framian/NumericColumnTyper.scala @@ -108,7 +108,6 @@ object NumericColumnTyper { if (x.isExact) rational(x.toRational) else bigFloat(x.toBigDecimal) } - case (x: String) => string(x) case _ => z } } @@ -135,7 +134,6 @@ object NumericColumnTyper { case Classes.BigDecimal => bigFloat(column.asInstanceOf[Column[BigDecimal]]) case Classes.JavaBigDecimal => bigFloat(column.asInstanceOf[Column[java.math.BigDecimal]] map (BigDecimal(_))) case cls if Classes.Rational isAssignableFrom cls => rational(column.asInstanceOf[Column[Rational]]) - case Classes.String => string(column.asInstanceOf[Column[String]]) case _ => z } } diff --git a/framian/src/main/scala/framian/UntypedColumn.scala b/framian/src/main/scala/framian/UntypedColumn.scala index 40fe34c..26290b8 100644 --- a/framian/src/main/scala/framian/UntypedColumn.scala 
+++ b/framian/src/main/scala/framian/UntypedColumn.scala @@ -38,17 +38,18 @@ import framian.column._ */ trait UntypedColumn extends ColumnLike[UntypedColumn] { def cast[A: ColumnTyper]: Column[A] - def orElse(that: UntypedColumn): UntypedColumn = MergedUntypedColumn(this, that) + def orElse(that: UntypedColumn): UntypedColumn = (this, that) match { + case (EmptyUntypedColumn, _) => that + case (_, EmptyUntypedColumn) => this + case _ => MergedUntypedColumn(this, that) + } } object UntypedColumn { implicit object monoid extends Monoid[UntypedColumn] { def id: UntypedColumn = empty - def op(lhs: UntypedColumn, rhs: UntypedColumn): UntypedColumn = (lhs, rhs) match { - case (EmptyUntypedColumn, _) => rhs - case (_, EmptyUntypedColumn) => lhs - case _ => MergedUntypedColumn(lhs, rhs) - } + def op(lhs: UntypedColumn, rhs: UntypedColumn): UntypedColumn = + lhs orElse rhs } final def empty: UntypedColumn = EmptyUntypedColumn diff --git a/framian/src/main/scala/framian/csv/Csv.scala b/framian/src/main/scala/framian/csv/Csv.scala new file mode 100644 index 0000000..1952b3c --- /dev/null +++ b/framian/src/main/scala/framian/csv/Csv.scala @@ -0,0 +1,136 @@ +package framian +package csv + +import spire.std.int._ +import spire.std.string._ + +import framian.column.ColumnBuilder + +sealed abstract class Csv { + val format: CsvFormat + val rows: Vector[Either[CsvError, CsvRow]] + + lazy val data: Vector[CsvRow] = + rows.collect { case Right(row) => row } + lazy val errors: Vector[CsvError] = + rows.collect { case Left(error) => error } + def hasErrors: Boolean = !errors.isEmpty + + def unlabeled: UnlabeledCsv = this match { + case csv @ UnlabeledCsv(_, _) => + csv + case LabeledCsv(format, _, rows) => + UnlabeledCsv(format.copy(header = false), rows) + } + + def labeled: LabeledCsv = this match { + case csv @ LabeledCsv(_, _, _) => + csv + case UnlabeledCsv(format, rows) => + val format0 = format.copy(header = true) + rows.headOption.flatMap(_.right.toOption).map { hdr => + 
LabeledCsv(format0, hdr.text(format), rows.tail) + }.getOrElse { + LabeledCsv(format0, Vector.empty, Vector.empty) + } + } + + override def toString: String = { + val full = this match { + case LabeledCsv(_, header, _) => + CsvRow(header map (CsvCell.Data(_))) +: data + case UnlabeledCsv(_, _) => + data + } + + full.iterator. + map(_ render format). + mkString(format.rowDelim.value) + } +} + +case class LabeledCsv(format: CsvFormat, header: Vector[String], rows: Vector[Either[CsvError, CsvRow]]) extends Csv { + def toFrame: Frame[Int, String] = + Csv.toFrame(data).withColIndex(Index.fromKeys(header: _*)) +} + +case class UnlabeledCsv(format: CsvFormat, rows: Vector[Either[CsvError, CsvRow]]) extends Csv { + def toFrame: Frame[Int, Int] = + Csv.toFrame(data) +} + +object Csv { + val BufferSize = 32 * 1024 + + def empty(format: CsvFormat): Csv = + if (format.header) LabeledCsv(format, Vector.empty, Vector.empty) + else UnlabeledCsv(format, Vector.empty) + + private[csv] def toFrame(rows: Vector[CsvRow]): Frame[Int, Int] = { + val cols = rows.foldLeft(Map.empty[Int,(ColumnBuilder[BigDecimal],ColumnBuilder[String])]) { (acc0, row) => + row.cells.zipWithIndex.foldLeft(acc0) { case (acc, (cell, colIdx)) => + val (numCol, strCol) = acc.getOrElse(colIdx, (Column.newBuilder[BigDecimal](), Column.newBuilder[String]())) + cell match { + case CsvCell.Data(value) => + numCol += scala.util.Try(BigDecimal(value)).map(Value(_)).getOrElse(NA) + strCol.addValue(value) + case CsvCell.Empty => + numCol.addNA() + strCol.addNA() + case CsvCell.Invalid => + numCol.addNM() + strCol.addNM() + } + acc + (colIdx -> (numCol, strCol)) + } + } + + val columns = Column.eval(cols.map { case (col, (numCol, strCol)) => + col -> Value(TypedColumn(numCol.result()) orElse TypedColumn(strCol.result())) + }) + + ColOrientedFrame( + Index(Array.range(0, rows.size)), + Index(Array.range(0, cols.size)), + columns + ) + } + + def fromFrame[Col](format: CsvFormat)(frame: Frame[_, Col]): Csv = { + val rows = 
frame.get(Cols.all[Col].as[CsvRow]).denseIterator.map { + case (_, row) => Right(row) + }.toVector + + if (format.header) { + val header = frame.colIndex.toVector.map(_._1.toString) + LabeledCsv(format, header, rows) + } else { + UnlabeledCsv(format, rows) + } + } + + import java.nio.charset.{ Charset, StandardCharsets } + import java.io.File + import java.io.{ InputStream, FileInputStream } + import java.io.{ Reader, InputStreamReader, StringReader } + + def parseReader(reader: Reader, format: CsvFormatStrategy = CsvFormat.Guess): Csv = { + val (format0, reader0) = format match { + case (guess: GuessCsvFormat) => guess(reader) + case (fmt: CsvFormat) => (fmt, reader) + } + CsvParser(format0).parseReader(reader0) + } + + def parseString(input: String, format: CsvFormatStrategy = CsvFormat.Guess): Csv = + parseReader(new StringReader(input), format) + + def parseInputStream(is: InputStream, format: CsvFormatStrategy = CsvFormat.Guess, charset: Charset = StandardCharsets.UTF_8): Csv = + parseReader(new InputStreamReader(is, charset), format) + + def parseFile(file: File, format: CsvFormatStrategy = CsvFormat.Guess, charset: Charset = StandardCharsets.UTF_8): Csv = + parseInputStream(new FileInputStream(file), format, charset) + + def parsePath(filename: String, format: CsvFormatStrategy = CsvFormat.Guess, charset: Charset = StandardCharsets.UTF_8): Csv = + parseFile(new File(filename), format, charset) +} diff --git a/framian/src/main/scala/framian/csv/CsvCell.scala b/framian/src/main/scala/framian/csv/CsvCell.scala new file mode 100644 index 0000000..4021930 --- /dev/null +++ b/framian/src/main/scala/framian/csv/CsvCell.scala @@ -0,0 +1,37 @@ +package framian +package csv + +import spire.syntax.monoid._ + +sealed abstract class CsvCell { + def render(format: CsvFormat): String +} + +object CsvCell { + case class Data(value: String) extends CsvCell { + def render(format: CsvFormat): String = format.render(value) + override def toString: String = value + } + case 
object Empty extends CsvCell { + def render(format: CsvFormat): String = format.empty + override def toString: String = "-" + } + case object Invalid extends CsvCell { + def render(format: CsvFormat): String = format.invalid + override def toString: String = "" + } + + def fromNonValue(nonValue: NonValue): CsvCell = nonValue match { + case NA => Empty + case NM => Invalid + } + + implicit val CsvCellColumnTyper: ColumnTyper[CsvCell] = new ColumnTyper[CsvCell] { + def cast(col: TypedColumn[_]): Column[CsvCell] = { + val num = col.cast[BigDecimal] map (n => Data(n.toString): CsvCell) + val text = col.cast[String] map (Data(_): CsvCell) + val any = col.cast[Any] map (any => Data(any.toString): CsvCell) + num |+| text |+| any + } + } +} diff --git a/framian/src/main/scala/framian/csv/CsvError.scala b/framian/src/main/scala/framian/csv/CsvError.scala new file mode 100644 index 0000000..9333029 --- /dev/null +++ b/framian/src/main/scala/framian/csv/CsvError.scala @@ -0,0 +1,12 @@ +package framian.csv + +case class CsvError(message: String, rowStart: Long, pos: Long, context: String, row: Long, col: Long) { + def description: String = { + val msg = s"Error parsing CSV row: $message" + val prefix = s"Row $row: " + val padLength = col.toInt - 1 + prefix.length + val pointer = (" " * padLength) + "^" + + s"$msg\n\n$prefix$context\n$pointer" + } +} diff --git a/framian/src/main/scala/framian/csv/CsvFormat.scala b/framian/src/main/scala/framian/csv/CsvFormat.scala new file mode 100644 index 0000000..6bbc8b7 --- /dev/null +++ b/framian/src/main/scala/framian/csv/CsvFormat.scala @@ -0,0 +1,233 @@ +package framian.csv + +import java.io.{ Reader, PushbackReader } +import java.util.regex.Pattern + +sealed abstract class CsvRowDelim(val value: String, val alternate: Option[String] = None) +object CsvRowDelim { + case class Custom(delim: String) extends CsvRowDelim(delim) + case object Unix extends CsvRowDelim("\n") + case object Windows extends CsvRowDelim("\r\n") + case object Both 
extends CsvRowDelim("\n", Some("\r\n")) +} + +sealed trait CsvFormatStrategy { + def withSeparator(separator: String): CsvFormatStrategy + def withQuote(quote: String): CsvFormatStrategy + def withQuoteEscape(quoteEscape: String): CsvFormatStrategy + def withEmpty(empty: String): CsvFormatStrategy + def withInvalid(invalid: String): CsvFormatStrategy + def withHeader(header: Boolean): CsvFormatStrategy + def withRowDelim(rowDelim: CsvRowDelim): CsvFormatStrategy + def withRowDelim(rowDelim: String): CsvFormatStrategy +} + +trait GuessCsvFormat extends CsvFormatStrategy { + + /** + * Makes a guess at the format of the CSV accessed by `reader`. This returns + * the format, as well as a new pushback reader to be used in place of + * `reader`. The original reader will have some data read out of it. The + * returned reader will contain all the original reader's data. + */ + def apply(reader: Reader): (CsvFormat, Reader) = { + val reader0 = new PushbackReader(reader, Csv.BufferSize) + val buffer = new Array[Char](Csv.BufferSize) + val len = math.max(reader0.read(buffer), 0) // read returns -1 at end of stream; treat an empty reader as an empty chunk + reader0.unread(buffer, 0, len) + + val chunk = new String(buffer, 0, len) + val format = apply(chunk) + (format, reader0) + } + + /** + * Given the first part of a CSV file, return a guess at the format. + */ + def apply(str: String): CsvFormat +} + +case class CsvFormat( + /** The delimiter that separates fields within the rows. */ + separator: String, + + /** The character/string that indicates the beginning/end of a quoted value. */ + quote: String = "\"", + + /** The string that is used to escape a quote character, within a quote. */ + quoteEscape: String = "\"", + + /** The value of an empty field (common values are - or ?). */ + empty: String = "", + + /** The value of an invalid field. Empty values take precedence, so setting + * this to the same value as `empty` essentially disables invalid values. */ + invalid: String = "", + + /** Indicates whether or not the CSV's first row is actually a header.
+ */ + header: Boolean = false, + + /** The delimiter used to separate rows. */ + rowDelim: CsvRowDelim = CsvRowDelim.Both, + + /** If true, allow row delimiters within quotes, otherwise they are treated + * as an error. */ + allowRowDelimInQuotes: Boolean = true +) extends CsvFormatStrategy { + val escapedQuote = quoteEscape + quote + + override def toString: String = + s"""CsvFormat(separator = "$separator", quote = "$quote", quoteEscape = "$quoteEscape", empty = "$empty", invalid = "$invalid", header = $header, rowDelim = $rowDelim, allowRowDelimInQuotes = $allowRowDelimInQuotes)""" + + /** + * Replaces all instances of \r\n with \n, then escapes all quotes and wraps + * the string in quotes. + */ + def escape(text: String): String = { + val text0 = text.replace("\r\n", "\n").replace(quote, escapedQuote) + s"${quote}$text0${quote}" + } + + /** + * Renders a single cell of data, escaping the value if necessary. + */ + def render(text: String): String = { + if ((text contains '\n') || + (text contains separator) || + (text contains quote)) escape(text) + else text + } + + def withSeparator(separator: String): CsvFormat = copy(separator = separator) + def withQuote(quote: String): CsvFormat = copy(quote = quote) + def withQuoteEscape(quoteEscape: String): CsvFormat = copy(quoteEscape = quoteEscape) + def withEmpty(empty: String): CsvFormat = copy(empty = empty) + def withInvalid(invalid: String): CsvFormat = copy(invalid = invalid) + def withHeader(header: Boolean): CsvFormat = copy(header = header) + def withRowDelim(rowDelim: CsvRowDelim): CsvFormat = copy(rowDelim = rowDelim) + def withRowDelim(rowDelim: String): CsvFormat = copy(rowDelim = CsvRowDelim.Custom(rowDelim)) +} + +object CsvFormat { + val CSV = CsvFormat(",") + val TSV = CsvFormat("\t") + + val Guess = Partial(header = Some(false)) + + case class Partial( + separator: Option[String] = None, + quote: Option[String] = None, + quoteEscape: Option[String] = None, + empty: Option[String] = None, + invalid:
+ Option[String] = None, + header: Option[Boolean] = None, + rowDelim: Option[CsvRowDelim] = None, + allowRowDelimInQuotes: Boolean = true + ) extends GuessCsvFormat { + + def withSeparator(separator: String): Partial = copy(separator = Some(separator)) + def withQuote(quote: String): Partial = copy(quote = Some(quote)) + def withQuoteEscape(quoteEscape: String): Partial = copy(quoteEscape = Some(quoteEscape)) + def withEmpty(empty: String): Partial = copy(empty = Some(empty)) + def withInvalid(invalid: String): Partial = copy(invalid = Some(invalid)) + def withHeader(header: Boolean): Partial = copy(header = Some(header)) + def withRowDelim(rowDelim: CsvRowDelim): Partial = copy(rowDelim = Some(rowDelim)) + def withRowDelim(rowDelim: String): Partial = copy(rowDelim = Some(CsvRowDelim.Custom(rowDelim))) + + /** + * Performs a very naive guess of the CsvFormat. This uses weighted + * frequencies of occurrences of common separators, row-delimiters, quotes, + * quote escapes, etc. and simply selects the max for each. For empty + * values, it uses the frequency of the possible empty values within + * the cells. + * + * This supports: + * + * * \r\n and \n as row delimiters, + * * ',', '\t', ';', and '|' as field delimiters, + * * '"', and ''' as quote delimiter, + * * the quote delimiter or \ for quote escapes, + * * '', '?', '-', 'N/A', and 'NA' as empty values, and + * * 'N/M' and 'NM' as invalid values. + * + * Headers are guessed by using the cosine similarity of the frequency of + * characters (except quotes/field delimiters) between the first row and + * all subsequent rows. Values below 0.5 will result in a header being + * inferred.
+ */ + def apply(str: String): CsvFormat = { + def count(ndl: String): Int = { + def check(i: Int, j: Int = 0): Boolean = + if (j >= ndl.length) true + else if (i < str.length && str.charAt(i) == ndl.charAt(j)) check(i + 1, j + 1) + else false + + def loop(i: Int, cnt: Int): Int = + if (i < str.length) { + loop(i + 1, if (check(i)) cnt + 1 else cnt) + } else cnt + + loop(0, 0) + } + + def choose(weightedOptions: (String, Double)*)(f: String => Int): String = { + val weights = Map(weightedOptions: _*) + val (best, weight) = weights.maxBy { case (c, w) => w * f(c) } + if (weight > 0) best else weights.maxBy(_._2)._1 + } + + val rowDelim0 = rowDelim.getOrElse { + val windCnt = count("\r\n") + val unixCnt = count("\n") + + if ((windCnt < 4 * unixCnt) && (unixCnt < 4 * windCnt)) CsvRowDelim.Both + else if (windCnt < 4 * unixCnt) CsvRowDelim.Unix + else CsvRowDelim.Windows + } + val separator0 = separator.getOrElse { + choose("," -> 2.0, "\t" -> 3.0, ";" -> 2.0, "|" -> 1.0)(count) + } + val quote0 = quote.getOrElse(choose("\"" -> 1.2, "\'" -> 1)(count)) + val quoteEscape0 = quoteEscape.getOrElse(choose(s"$quote0$quote0" -> 1.1, s"\\$quote0" -> 1)(count).dropRight(quote0.length)) // an explicitly configured escape wins over the guess + + val cells = for { + row0 <- str.split(Pattern.quote(rowDelim0.value)) + row <- rowDelim0.alternate.fold(Array(row0)) { alt => + row0.split(Pattern.quote(alt)) + } + cell <- row.split(Pattern.quote(separator0)) + } yield cell + def matches(value: String): Int = cells.filter(_ == value).size + val empty0 = empty.getOrElse { + choose("" -> 3, "?"
-> 2, "-" -> 2, "N/A" -> 1, "NA" -> 1)(matches) + } + val invalid0 = invalid.getOrElse { + if (matches("N/M") > 1) "N/M" else empty0 + } + + val header0 = header.getOrElse(hasHeader(str, rowDelim0.value, separator0, quote0)) + + CsvFormat(separator0, quote0, quoteEscape0, empty0, invalid0, header0, rowDelim0, allowRowDelimInQuotes) + } + + def hasHeader(chunk: String, rowDelim: String, separator: String, quote: String): Boolean = { + import spire.std.map._ + import spire.std.double._ + import spire.syntax.all._ + + def mkVec(s: String): Map[Char, Double] = + s.groupBy(c => c).map { case (k, v) => k -> v.length.toDouble }.normalize + + def similarity[K](x: Map[K, Double], y: Map[K, Double]): Double = (x dot y) / (x.norm * y.norm) + + val headerEnd = chunk.indexOf(rowDelim) + if (headerEnd > 0) { + val (hdr, rows) = chunk.replace(separator, "").replace(quote, "").splitAt(headerEnd) + val score = similarity(mkVec(hdr), mkVec(rows)) // computed once; no debug output from library code + score < 0.5 + } else { + false + } + } + } +} diff --git a/framian/src/main/scala/framian/csv/CsvParser.scala b/framian/src/main/scala/framian/csv/CsvParser.scala new file mode 100644 index 0000000..f355a2a --- /dev/null +++ b/framian/src/main/scala/framian/csv/CsvParser.scala @@ -0,0 +1,293 @@ +package framian.csv + +import java.nio.charset.{ Charset, StandardCharsets } +import java.io.File +import java.io.{ InputStream, FileInputStream } +import java.io.{ Reader, InputStreamReader } + +case class CsvParser(format: CsvFormat) { + import ParserState._ + import Instr._ + + private def removeRowDelim(context: String): String = { + def dropTail(tail: String): Option[String] = + if (context.endsWith(tail)) Some(context.dropRight(tail.length)) + else None + + dropTail(format.rowDelim.value). + orElse(format.rowDelim.alternate.flatMap(dropTail)).
+ getOrElse(context) + } + + def parseResource[A](a: A, close: A => Unit)(read: A => Option[String]): Csv = { + def loop(s0: ParserState, fail: Option[Fail], row: Long, acc: Vector[Either[CsvError, CsvRow]]): Csv = { + val (s1, instr) = parse(s0) + + instr match { + case Emit(cells) => + loop(s1, fail, row + 1, acc :+ Right(cells)) + + case f @ Fail(_, _) => + loop(s1, Some(f), row, acc) + + case Resume => + fail match { + case Some(Fail(msg, pos)) => + val context = removeRowDelim(s1.input.substring(s0.rowStart, s1.rowStart)) + val error = CsvError(msg, s0.rowStart, pos, context, row, pos - s0.rowStart + 1) + loop(s1, None, row + 1, acc :+ Left(error)) + + case None => + loop(s1, None, row, acc) + } + + case NeedInput => + read(a) match { + case Some(chunk) => + loop(s1.mapInput(_.append(chunk)), fail, row, acc) + case None => + loop(s1.mapInput(_.finished), fail, row, acc) + } + + case Done => + val csv = UnlabeledCsv(format, acc) + if (format.header) csv.labeled else csv + } + } + + try { + read(a).map { input0 => + loop(ParseRow(0L, 0L, Input.init(input0)), None, 1L, Vector.empty) + }.getOrElse { + Csv.empty(format) + } + } finally { + try { + close(a) + } catch { case (_: Exception) => + // Do nothing - hopefully letting original exception through. 
+ } + } + } + + def parseReader(reader: Reader): Csv = { + val buffer = new Array[Char](Csv.BufferSize) + parseResource[Reader](reader, _.close()) { reader => + val len = reader.read(buffer) + if (len >= 0) { + Some(new String(buffer, 0, len)) + } else { + None + } + } + } + + def parseInputStream(is: InputStream, charset: Charset = StandardCharsets.UTF_8): Csv = + parseReader(new InputStreamReader(is, charset)) + + def parseFile(file: File, charset: Charset = StandardCharsets.UTF_8): Csv = + parseInputStream(new FileInputStream(file), charset) + + def parseString(input: String): Csv = { + var next: Option[String] = Some(input) + parseResource[Unit]((), _ => ()) { _ => + val chunk = next; next = None; chunk + } + } + + private def parse(state: ParserState): (ParserState, Instr[CsvRow]) = { + import format._ + + val input: Input = state.input + var pos: Long = state.readFrom + def ch: Char = input.charAt(pos) + def endOfInput: Boolean = pos >= input.length + def endOfFile: Boolean = endOfInput && input.isLast + def advance(i: Long = 1): Unit = pos += i + def retreat(i: Long = 1): Unit = pos -= i + + def isFlag(str: String): () => Int = { + def loop(i: Int): Int = + if (i >= str.length) { + retreat(i) + i + } else if (endOfInput) { + retreat(i) + if (endOfFile) 0 else -1 + } else if (str.charAt(i) == ch) { + advance() + loop(i + 1) + } else { + retreat(i) + 0 + } + + () => loop(0) + } + + def either(f0: () => Int, f1: () => Int): () => Int = { () => + val i = f0() + if (i == 0) f1() else i + } + + val isQuote = isFlag(quote) + val isQuoteEscape = isFlag(quoteEscape) + val isSeparator = isFlag(separator) + val isRowDelim = rowDelim.alternate.map { alt => + either(isFlag(rowDelim.value), isFlag(alt)) + }.getOrElse(isFlag(rowDelim.value)) + val isEndOfCell = either(isSeparator, isRowDelim) + def isEscapedQuote() = { + val e = isQuoteEscape() + if (e > 0) { + advance(e) + val q = isQuote() + retreat(e) + if (q > 0) q + e + else q + } else { + e + } + } + + def 
unquotedCell(): ParseResult[CsvCell] = { + val start = pos + def loop(): ParseResult[CsvCell] = { + val flag = isEndOfCell() + if (flag > 0 || endOfFile) { + val value = input.substring(start, pos) + val csvCell = + if (value == empty) CsvCell.Empty + else if (value == invalid) CsvCell.Invalid + else CsvCell.Data(value) + Emit(csvCell) + } else if (flag == 0) { + advance() + loop() + } else { + NeedInput + } + } + + loop() + } + + def quotedCell(): ParseResult[CsvCell] = { + val start = pos + def loop(): ParseResult[CsvCell] = { + if (endOfInput) { + if (endOfFile) { + Fail("Unmatched quoted string at end of file", pos) + } else { + NeedInput + } + } else { + val d = if (allowRowDelimInQuotes) 0 else isRowDelim() + val e = isEscapedQuote() + val q = isQuote() + + if (d < 0 || e < 0 || q < 0) { + NeedInput + } else if (d > 0) { + Fail("Unmatched quoted string at row delimiter", pos) + } else if (e > 0) { + advance(e) + loop() + } else if (q > 0) { + val escaped = input.substring(start, pos).replace(escapedQuote, quote) + advance(q) + Emit(CsvCell.Data(escaped)) + } else { + advance(1) + loop() + } + } + } + + loop() + } + + def cell(): ParseResult[CsvCell] = { + val q = isQuote() + if (q == 0) { + unquotedCell() + } else if (q > 0) { + advance(q) + quotedCell() + } else { + NeedInput + } + } + + def skipToNextRow(): Boolean = { + val d = isRowDelim() + if (d > 0 || endOfFile) { + advance(d) + true + } else if (d == 0) { + advance(1) + skipToNextRow() + } else { + if (input.isLast) + advance(input.length - pos) + input.isLast + } + } + + def row(rowStart: Long, cells: Vector[CsvCell]): (ParserState, Instr[CsvRow]) = { + val start = pos + def needInput() = (ContinueRow(rowStart, start, cells, input), NeedInput) + + val s = isSeparator() + if (s == 0) { + val r = isRowDelim() + if (r > 0 || endOfFile) { + advance(r) + (ParseRow(pos, pos, input.marked(pos)), Emit(new CsvRow(cells))) + } else if (r == 0) { + (SkipRow(rowStart, pos, input), Fail("Expected separator, row 
delimiter, or end of file", pos)) + } else { + needInput() + } + } else if (s > 0) { + advance(s) + cell() match { + case Emit(c) => + row(rowStart, cells :+ c) + case f @ Fail(_, _) => + (SkipRow(rowStart, pos, input), f) + case NeedInput => + needInput() + } + } else { + needInput() + } + } + + state match { + case ContinueRow(rowStart, readFrom, partial, _) => + row(rowStart, partial) + + case instr @ ParseRow(rowStart, readFrom, _) => + if (endOfFile) { + (instr, Done) + } else { + cell() match { + case Emit(csvCell) => + row(rowStart, Vector(csvCell)) + case f @ Fail(_, _) => + (SkipRow(rowStart, pos, input), f) + case NeedInput => + (instr, NeedInput) + } + } + + case SkipRow(rowStart, readFrom, _) => + if (skipToNextRow()) { + (ParseRow(pos, pos, input.marked(pos)), Resume) + } else { + (SkipRow(rowStart, pos, input), NeedInput) + } + } + } +} diff --git a/framian/src/main/scala/framian/csv/CsvRow.scala b/framian/src/main/scala/framian/csv/CsvRow.scala new file mode 100644 index 0000000..54c66ad --- /dev/null +++ b/framian/src/main/scala/framian/csv/CsvRow.scala @@ -0,0 +1,26 @@ +package framian +package csv + +/** + * A single row in a CSV file. + */ +final class CsvRow(val cells: Vector[CsvCell]) extends AnyVal { + def text(format: CsvFormat): Vector[String] = cells.map(_ render format) + + def render(format: CsvFormat): String = + cells.iterator map (_ render format) mkString format.separator + + override def toString: String = + cells.mkString("CsvRow(", ", ", ")") +} + +object CsvRow extends (Vector[CsvCell] => CsvRow) { + def apply(cells: Vector[CsvCell]): CsvRow = new CsvRow(cells) + + implicit def csvRowExtractor[Col]: RowExtractor[CsvRow, Col, Variable] = { + import CsvCell.CsvCellColumnTyper // TODO: WHY IS THIS NEEDED! ARGH! 
+ RowExtractor.collectionOf[Vector, CsvCell, Col].map { cells => + CsvRow(cells.map(_.fold[CsvCell](CsvCell.Empty, CsvCell.Invalid)(cell => cell))) + } + } +} diff --git a/framian/src/main/scala/framian/csv/CsvRowExtractor.scala b/framian/src/main/scala/framian/csv/CsvRowExtractor.scala deleted file mode 100644 index 5192d5b..0000000 --- a/framian/src/main/scala/framian/csv/CsvRowExtractor.scala +++ /dev/null @@ -1,132 +0,0 @@ -/* _____ _ - * | ___| __ __ _ _ __ ___ (_) __ _ _ __ - * | |_ | '__/ _` | '_ ` _ \| |/ _` | '_ \ - * | _|| | | (_| | | | | | | | (_| | | | | - * |_| |_| \__,_|_| |_| |_|_|\__,_|_| |_| - * - * Copyright 2014 Pellucid Analytics - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package framian -package csv - -import spire.syntax.monoid._ - -import framian.column._ - -sealed abstract class CsvRowDelim(val value: String) -object CsvRowDelim { - case object Unix extends CsvRowDelim("\n") - case object Windows extends CsvRowDelim("\r\n") -} - -case class CsvFormat( - separator: String, - quote: String = "\"", - quoteEscape: String = "\"", - empty: String = "NA", - invalid: String = "NM", - header: Boolean = true, - rowDelim: CsvRowDelim = CsvRowDelim.Windows -) { - val escapedQuote = quoteEscape + quote - - /** - * Replaces all instances of \r\n with \n, then escapes all quotes and wraps - * the string in quotes. 
- */ - def escape(text: String): String = { - val text0 = text.replace("\r\n", "\n").replace(quote, escapedQuote) - s"${quote}$text0${quote}" - } - - /** - * Renders a single cell of data, escaping the value if necessary. - */ - def render(text: String): String = { - if ((text contains '\n') || - (text contains separator) || - (text contains quote)) escape(text) - else text - } -} - -object CsvFormat { - val CSV = CsvFormat(",") - val TSV = CsvFormat("\t") -} - -sealed abstract class CsvCell(val render: CsvFormat => String) - -object CsvCell { - case class Number(num: BigDecimal) extends CsvCell(_ render num.toString) - case class Text(value: String) extends CsvCell(_ render value) - case object Empty extends CsvCell(_.empty) - case object Invalid extends CsvCell(_.invalid) - - def fromNonValue(nonValue: NonValue): CsvCell = nonValue match { - case NA => Empty - case NM => Invalid - } - - implicit object CsvCellColumnTyper extends ColumnTyper[CsvCell] { - def cast(col: TypedColumn[_]): Column[CsvCell] = { - val num = col.cast[BigDecimal] map (Number(_): CsvCell) - val text = col.cast[String] map (Text(_): CsvCell) - val any = col.cast[Any] map { a => Text(a.toString): CsvCell } - num orElse text orElse any - } - } -} - -/** - * A single row in a CSV file. 
- */ -final class CsvRow(val cells: List[CsvCell]) extends AnyVal { - def render(format: CsvFormat): String = - cells.iterator map (_ render format) mkString format.separator -} - -object CsvRow { - def apply(cells: List[CsvCell]): CsvRow = new CsvRow(cells) - - implicit object CsvRowExtractor extends RowExtractor[CsvRow, String, Variable] { - type P = List[Column[CsvCell]] - def prepare(cols: Series[String, UntypedColumn], keys: List[String]): Option[List[Column[CsvCell]]] = - Some(keys flatMap { key => cols(key).value.map(_.cast[CsvCell](CsvCell.CsvCellColumnTyper)) }) - - def extract(row: Int, cols: List[Column[CsvCell]]): Cell[CsvRow] = - Value(CsvRow(cols map { _.foldRow(row)(CsvCell.Empty, CsvCell.Invalid, a => a) })) - } -} - -final case class Csv(header: Option[List[String]], rows: List[CsvRow]) { - def render(format: CsvFormat): String = { - val full = header filter (_ => format.header) map { headings => - CsvRow(headings map (CsvCell.Text(_))) :: rows - } getOrElse rows - full.iterator map (_ render format) mkString format.rowDelim.value - } -} - -object Csv { - def fromFrame(frame: Frame[_, String]): Csv = { - val header = frame.colIndex.toList map (_._1) - val rows = frame.get(Cols.all.as[CsvRow]).denseIterator.map { - case (_, row) => row - }.toList - Csv(Some(header), rows) - } -} diff --git a/framian/src/main/scala/framian/csv/Input.scala b/framian/src/main/scala/framian/csv/Input.scala new file mode 100644 index 0000000..2904f8b --- /dev/null +++ b/framian/src/main/scala/framian/csv/Input.scala @@ -0,0 +1,46 @@ +package framian.csv + +case class Input(offset: Long, data: String, isLast: Boolean, mark: Long) { + private def check(i: Long): Int = if ((i < offset) || (i > (offset + data.length))) { + throw new IndexOutOfBoundsException() + } else { + val j = i - offset + if (j <= Int.MaxValue) { + j.toInt + } else { + throw new IndexOutOfBoundsException() + } + } + + def charAt(i: Long): Char = data.charAt(check(i)) + + def length: Long = offset + 
data.length + + def substring(from: Long, until: Long): String = + data.substring(check(from), check(until)) + + def marked(pos: Long): Input = + Input(offset, data, isLast, pos) + + private def trim: Input = if (mark > offset) { + val next = spire.math.min(mark - offset, data.length.toLong).toInt + val tail = data.substring(next) + val offset0 = offset + next + Input(offset0, tail, isLast, offset0) + } else this + + def append(chunk: String, last: Boolean = false): Input = + if (mark > offset) trim.append(chunk, last) + else if (chunk.isEmpty) Input(offset, data, last, mark) + else Input(offset, data + chunk, last, mark) + + def finished: Input = Input(offset, data, true, mark) +} + +object Input { + def fromString(str: String): Input = + Input(0, str, true, 0) + + def init(str: String): Input = + Input(0, str, false, 0) +} diff --git a/framian/src/main/scala/framian/csv/ParseResult.scala b/framian/src/main/scala/framian/csv/ParseResult.scala new file mode 100644 index 0000000..3925013 --- /dev/null +++ b/framian/src/main/scala/framian/csv/ParseResult.scala @@ -0,0 +1,14 @@ +package framian.csv + +sealed trait Instr[+A] + +object Instr { + sealed trait ParseResult[+A] extends Instr[A] + + case class Emit[+A](value: A) extends ParseResult[A] + case class Fail(message: String, pos: Long) extends ParseResult[Nothing] + case object NeedInput extends ParseResult[Nothing] + + case object Resume extends Instr[Nothing] + case object Done extends Instr[Nothing] +} diff --git a/framian/src/main/scala/framian/csv/ParserState.scala b/framian/src/main/scala/framian/csv/ParserState.scala new file mode 100644 index 0000000..4b3449e --- /dev/null +++ b/framian/src/main/scala/framian/csv/ParserState.scala @@ -0,0 +1,23 @@ +package framian.csv + +sealed trait ParserState { + import ParserState._ + + def rowStart: Long + def readFrom: Long + def input: Input + + def withInput(input0: Input): ParserState = this match { + case ContinueRow(_, _, partial, _) => ContinueRow(rowStart, 
readFrom, partial, input0) + case SkipRow(_, _, _) => SkipRow(rowStart, readFrom, input0) + case ParseRow(_, _, _) => ParseRow(rowStart, readFrom, input0) + } + + def mapInput(f: Input => Input): ParserState = withInput(f(input)) +} + +object ParserState { + case class ContinueRow(rowStart: Long, readFrom: Long, partial: Vector[CsvCell], input: Input) extends ParserState + case class SkipRow(rowStart: Long, readFrom: Long, input: Input) extends ParserState + case class ParseRow(rowStart: Long, readFrom: Long, input: Input) extends ParserState +} diff --git a/framian/src/test/resources/csvs/auto-mpg-test.tsv b/framian/src/test/resources/csvs/auto-mpg-test.tsv index 5c6fa70..903c077 100644 --- a/framian/src/test/resources/csvs/auto-mpg-test.tsv +++ b/framian/src/test/resources/csvs/auto-mpg-test.tsv @@ -1,5 +1,5 @@ -18.0 8 307.0 130.0 3504. 12.0 70 1 "chevrolet chevelle malibu" -15.0 8 350.0 165.0 3693. 11.5 70 1 "buick skylark 320" -18.0 8 318.0 150.0 3436. 11.0 70 1 "plymouth satellite" -16.0 8 304.0 150.0 3433. 12.0 70 1 "amc rebel sst" -17.0 8 302.0 140.0 3449. 10.5 70 1 "ford torino" +18.0 8 307.0 130.0 3504. 12.0 70 1 "chevrolet chevelle malibu" +15.0 8 350.0 165.0 3693. 11.5 70 1 "buick skylark 320" +18.0 8 318.0 150.0 3436. 11.0 70 1 "plymouth satellite" +16.0 8 304.0 150.0 3433. 12.0 70 1 "amc rebel sst" +17.0 8 302.0 140.0 3449. 
10.5 70 1 "ford torino" diff --git a/framian/src/test/scala/framian/CsvSpec.scala b/framian/src/test/scala/framian/CsvSpec.scala index b8d0dbb..b8db6b9 100644 --- a/framian/src/test/scala/framian/CsvSpec.scala +++ b/framian/src/test/scala/framian/CsvSpec.scala @@ -1,5 +1,5 @@ package framian -package utilities +package csv import org.specs2.mutable._ @@ -19,128 +19,145 @@ class CsvSpec extends Specification { val airPassengersBadComma = csvRoot +"AirPassengers-badcomma.csv" val autoMPG = csvRoot +"auto-mpg-test.tsv" - val defaultRowIndex = Index.fromKeys("0", "1", "2", "3", "4") - val withColumnRowIndex = Index.fromKeys("0", "1", "2", "3") - val defaultAPColumnIndex = Index.fromKeys("0", "1", "2") - - val defaultAirPassengers = Frame.fromRows( - "" :: "time" :: "AirPassengers" :: HNil, - "1" :: "1949" :: "112" :: HNil, - "2" :: "1949.08333333333" :: "118" :: HNil, - "3" :: "1949.16666666667" :: "132" :: HNil, - "4" :: "1949.25" :: "129" :: HNil) - .withColIndex(defaultAPColumnIndex) - .withRowIndex(defaultRowIndex) + val defaultRowIndex = Index.fromKeys(0, 1, 2, 3, 4) + val withColumnRowIndex = Index.fromKeys(0, 1, 2, 3) + val defaultAPColumnIndex = Index.fromKeys(0, 1, 2) + + val defaultAirPassengers = ColOrientedFrame( + Index.fromKeys(0, 1, 2, 3, 4), + Series( + 0 -> TypedColumn(Column[Int]( + NA, + Value(1), + Value(2), + Value(3), + Value(4)) + ).orElse(TypedColumn(Column[String]( + Value("") + ))), + 1 -> TypedColumn(Column[BigDecimal]( + NA, + Value(BigDecimal("1949")), + Value(BigDecimal("1949.08333333333")), + Value(BigDecimal("1949.16666666667")), + Value(BigDecimal("1949.25"))) + ).orElse(TypedColumn(Column[String]( + Value("time") + ))), + 2 -> TypedColumn(Column[BigDecimal]( + NA, + Value(BigDecimal("112")), + Value(BigDecimal("118")), + Value(BigDecimal("132")), + Value(BigDecimal("129"))) + ).orElse(TypedColumn(Column[String]( + Value("AirPassengers") + ))))) val columnAirPassengers = Frame.fromRows( - "1" :: "1949" :: "112" :: HNil, - "2" :: 
"1949.08333333333" :: "118" :: HNil, - "3" :: "1949.16666666667" :: "132" :: HNil, - "4" :: "1949.25" :: "129" :: HNil) + 1 :: BigDecimal(1949) :: 112 :: HNil, + 2 :: BigDecimal(1949.08333333333) :: 118 :: HNil, + 3 :: BigDecimal(1949.16666666667) :: 132 :: HNil, + 4 :: BigDecimal(1949.25) :: 129 :: HNil) .withColIndex(Index.fromKeys("", "time", "AirPassengers")) .withRowIndex(withColumnRowIndex) - val rowAirPassengers = (Frame.fromRows( - "time" :: "AirPassengers" :: HNil, - "1949" :: "112" :: HNil, - "1949.08333333333" :: "118" :: HNil, - "1949.16666666667" :: "132" :: HNil, - "1949.25" :: "129" :: HNil) - .withColIndex(Index.fromKeys("0", "1")) - .withRowIndex(Index.fromKeys("", "1", "2", "3", "4"))) - val correctAirPassengers = Frame.fromRows( - "1949" :: "112" :: HNil, - "1949.08333333333" :: "118" :: HNil, - "1949.16666666667" :: "132" :: HNil, - "1949.25" :: "129" :: HNil) - .withColIndex(Index.fromKeys("time", "AirPassengers")) - .withRowIndex(Index.fromKeys("1", "2", "3", "4")) val defaultMPG = Frame.fromRows( - "18.0" :: "8" :: "307.0" :: "130.0" :: "3504." :: "12.0" :: "70" :: "1" :: "chevrolet chevelle malibu" :: HNil, - "15.0" :: "8" :: "350.0" :: "165.0" :: "3693." :: "11.5" :: "70" :: "1" :: "buick skylark 320" :: HNil, - "18.0" :: "8" :: "318.0" :: "150.0" :: "3436." :: "11.0" :: "70" :: "1" :: "plymouth satellite" :: HNil, - "16.0" :: "8" :: "304.0" :: "150.0" :: "3433." :: "12.0" :: "70" :: "1" :: "amc rebel sst" :: HNil, - "17.0" :: "8" :: "302.0" :: "140.0" :: "3449." :: "10.5" :: "70" :: "1" :: "ford torino" :: HNil) - .withRowIndex(defaultRowIndex) - .withColIndex(Index.fromKeys("0", "1", "2", "3", "4", "5", "6", "7", "8")) - val withRowIndexMPG = Frame.fromRows( - "18.0" :: "8" :: "307.0" :: "130.0" :: "3504." :: "12.0" :: "70" :: "1" :: HNil, - "15.0" :: "8" :: "350.0" :: "165.0" :: "3693." :: "11.5" :: "70" :: "1" :: HNil, - "18.0" :: "8" :: "318.0" :: "150.0" :: "3436." 
:: "11.0" :: "70" :: "1" :: HNil, - "16.0" :: "8" :: "304.0" :: "150.0" :: "3433." :: "12.0" :: "70" :: "1" :: HNil, - "17.0" :: "8" :: "302.0" :: "140.0" :: "3449." :: "10.5" :: "70" :: "1" :: HNil) - .withRowIndex(Index.fromKeys( - "chevrolet chevelle malibu", "buick skylark 320", - "plymouth satellite", "amc rebel sst", "ford torino")) - .withColIndex(Index.fromKeys("0", "1", "2", "3", "4", "5", "6", "7")) - val customColsMPG = Frame.fromRows( - "18.0" :: "8" :: "307.0" :: HNil, - "15.0" :: "8" :: "350.0" :: HNil, - "18.0" :: "8" :: "318.0" :: HNil, - "16.0" :: "8" :: "304.0" :: HNil, - "17.0" :: "8" :: "302.0" :: HNil) - .withRowIndex(defaultRowIndex) - .withColIndex(Index.fromKeys("0", "1", "2")) - val withRowIndexCustomColsMPG = Frame.fromRows( - "18.0" :: "8" :: "307.0" :: HNil, - "15.0" :: "8" :: "350.0" :: HNil, - "18.0" :: "8" :: "318.0" :: HNil, - "16.0" :: "8" :: "304.0" :: HNil, - "17.0" :: "8" :: "302.0" :: HNil) - .withRowIndex(Index.fromKeys( - "chevrolet chevelle malibu", "buick skylark 320", - "plymouth satellite", "amc rebel sst", "ford torino")) - .withColIndex(Index.fromKeys("0", "1", "2")) - - val apBadComma = Frame.fromRows( - "" :: "FlightName" :: "AirPassengers" :: HNil, - "1" :: "ABCD111" :: "112" :: HNil, - "2" :: "Delta20394" :: "118" :: HNil, - "3" :: "FLIGHTTOHELL, REALLY" :: "132" :: HNil, - "4" :: "United666" :: "129" :: HNil) - .withColIndex(defaultAPColumnIndex) + 18.0 :: 8 :: 307.0 :: 130.0 :: 3504 :: 12.0 :: 70 :: 1 :: "chevrolet chevelle malibu" :: HNil, + 15.0 :: 8 :: 350.0 :: 165.0 :: 3693 :: 11.5 :: 70 :: 1 :: "buick skylark 320" :: HNil, + 18.0 :: 8 :: 318.0 :: 150.0 :: 3436 :: 11.0 :: 70 :: 1 :: "plymouth satellite" :: HNil, + 16.0 :: 8 :: 304.0 :: 150.0 :: 3433 :: 12.0 :: 70 :: 1 :: "amc rebel sst" :: HNil, + 17.0 :: 8 :: 302.0 :: 140.0 :: 3449 :: 10.5 :: 70 :: 1 :: "ford torino" :: HNil) .withRowIndex(defaultRowIndex) + .withColIndex(Index.fromKeys(0, 1, 2, 3, 4, 5, 6, 7, 8)) + + "CsvParser" should { + "parse air 
passengers as unlabeled CSV" in { + Csv.parsePath(airPassengers).unlabeled.toFrame must_== defaultAirPassengers + } - def getFile(loc:String) = new File(loc) + "parse air passengers as labeled CSV" in { + Csv.parsePath(airPassengers).labeled.toFrame must_== columnAirPassengers + } + + "parse autoMPG as unlabeled TSV" in { + Csv.parsePath(autoMPG).unlabeled.toFrame must_== defaultMPG + } - "Csv parser" should { - "parse air passengers with default settings" in { - loadFrameFromCSV(getFile(airPassengers)) must_== defaultAirPassengers - } + "parse CSV with separator in quote" in { + val data = """a,"b","c,d"|"e,f,g"""" + val csv = Csv.parseString(data, CsvFormat.Guess.withRowDelim("|")) + val frame = csv.unlabeled.toFrame + frame.getRow(0) must_== Some(Rec(0 -> "a", 1 -> "b", 2 -> "c,d")) + frame[String](1, 0) must_== Value("e,f,g") + frame[String](1, 1) must_== NA + frame[String](1, 2) must_== NA + } - "parse air passengers with just column headers" in { - loadFrameFromCSV(getFile(airPassengers), columnIndex = true) must_== columnAirPassengers + import CsvCell._ + + val TestFormat = CsvFormat( + separator = ",", + quote = "'", + quoteEscape = "'", + empty = "N/A", + invalid = "N/M", + header = false, + rowDelim = CsvRowDelim.Custom("|"), + allowRowDelimInQuotes = true + ) + + "parse escaped quotes" in { + Csv.parseString( + "a,'''','c'''|'''''d''''', ''''", + TestFormat + ).rows must_== Vector( + Right(CsvRow(Vector(Data("a"), Data("'"), Data("c'")))), + Right(CsvRow(Vector(Data("''d''"), Data(" ''''")))) + ) } - "parse air passengers with just row headers" in { - loadFrameFromCSV(getFile(airPassengers), columnIndex = false, rowIndex = 0) must_== rowAirPassengers + "respect CsvFormat separator" in { + Csv.parseString("a,b,c|d,e,f", TestFormat).rows must_== + Csv.parseString("a;b;c|d;e;f", TestFormat.withSeparator(";")).rows } - "parse air passengers with row and column headers" in { - loadFrameFromCSV(getFile(airPassengers), columnIndex = true, rowIndex = 0) must_== 
correctAirPassengers + "respect CsvFormat quote" in { + Csv.parseString("'a,b','b'|d,e", TestFormat).rows must_== + Csv.parseString("^a,b^,^b^|d,e", TestFormat.withQuote("^")).rows } - "fail to parse autoMPG with default settings" in { - loadFrameFromCSV(getFile(autoMPG)) must throwA[AssertionError] + "respect CsvFormat quote escape" in { + Csv.parseString("'a''b',''''|' '", TestFormat).rows must_== + Csv.parseString("'a\\'b','\\''|' '", TestFormat.withQuoteEscape("\\")).rows } - "parse autoMPG with delimiter as tab but otherwise default settings" in { - loadFrameFromCSV(getFile(autoMPG), delimiter = "\t") must_== defaultMPG + "respect CsvFormat empty" in { + Csv.parseString("a,N/A,b|N/A,N/A", TestFormat).rows must_== + Csv.parseString("a,,b|,", TestFormat.withEmpty("")).rows } - "parse autoMPG with delimiter as tab and row index as column 8" in { - loadFrameFromCSV(getFile(autoMPG), delimiter = "\t", rowIndex = 8) must_== withRowIndexMPG + "respect CsvFormat invalid" in { + Csv.parseString("a,N/M,b|N/M,N/M", TestFormat).rows must_== + Csv.parseString("a,nm,b|nm,nm", TestFormat.withInvalid("nm")).rows } - "parse autoMPG with delimiter as tab and just take first three columns" in { - loadFrameFromCSV(getFile(autoMPG), delimiter = "\t", columns = List(0, 1, 2)) must_== customColsMPG + "respect CsvFormat row delimiter" in { + Csv.parseString("a,b|c,d|e,f", TestFormat).rows must_== + Csv.parseString("a,b\nc,d\ne,f", TestFormat.withRowDelim(CsvRowDelim.Unix)).rows } - "parse autoMPG with delimiter as tab, column 8 as row index and just take first three columns" in { - loadFrameFromCSV(getFile(autoMPG), delimiter = "\t", rowIndex = 8, columns = List(0, 1, 2)) must_== withRowIndexCustomColsMPG + "parse CSV with row delimiter in quote" in { + Csv.parseString("a,'b|c'|'d|e',f", TestFormat).rows must_== Vector( + Right(CsvRow(Vector(Data("a"), Data("b|c")))), + Right(CsvRow(Vector(Data("d|e"), Data("f"))))) } - "parse a file that has the delimiter within a column within the 
chosen quote type" in { - loadFrameFromCSV(getFile(airPassengersBadComma)) must_== apBadComma + "parser respects whitespace" in { + val data = " a , , 'a','b'| b ,c , " + val csv = Csv.parseString(data, CsvFormat.Guess.withRowDelim("|")) + csv.rows must_== Vector( + Right(CsvRow(Vector(Data(" a "), Data(" "), Data(" 'a'"), Data("b")))), + Right(CsvRow(Vector(Data(" b "), Data("c "), Data(" "))))) } } }