From cad89b5053d3973310d7cc3f99faf7d1a7e2935f Mon Sep 17 00:00:00 2001 From: Devin Smith Date: Tue, 1 Feb 2022 16:01:56 -0800 Subject: [PATCH 1/7] Depend on deephaven-csv --- build.gradle | 6 + extensions/csv/build.gradle | 4 +- .../main/java/io/deephaven/csv/CsvSpecs.java | 865 ------- .../main/java/io/deephaven/csv/CsvTools.java | 364 ++- .../java/io/deephaven/csv/InferenceSpecs.java | 136 -- .../deephaven/csv/containers/ByteSlice.java | 109 - .../csv/containers/GrowableByteBuffer.java | 72 - .../densestorage/DenseStorageConstants.java | 34 - .../csv/densestorage/DenseStorageReader.java | 69 - .../csv/densestorage/DenseStorageWriter.java | 147 -- .../deephaven/csv/densestorage/QueueNode.java | 28 - .../csv/densestorage/QueueReader.java | 221 -- .../csv/densestorage/QueueWriter.java | 258 --- .../csv/parsers/BooleanAsByteParser.java | 60 - .../io/deephaven/csv/parsers/ByteParser.java | 75 - .../io/deephaven/csv/parsers/CharParser.java | 70 - .../csv/parsers/DateTimeAsLongParser.java | 68 - .../deephaven/csv/parsers/DoubleParser.java | 68 - .../csv/parsers/FloatFastParser.java | 76 - .../csv/parsers/FloatStrictParser.java | 71 - .../io/deephaven/csv/parsers/IntParser.java | 77 - .../deephaven/csv/parsers/IteratorHolder.java | 78 - .../io/deephaven/csv/parsers/LongParser.java | 71 - .../java/io/deephaven/csv/parsers/Parser.java | 146 -- .../io/deephaven/csv/parsers/Parsers.java | 111 - .../io/deephaven/csv/parsers/ShortParser.java | 75 - .../deephaven/csv/parsers/StringParser.java | 58 - .../csv/parsers/TimestampMicrosParser.java | 12 - .../csv/parsers/TimestampMillisParser.java | 12 - .../csv/parsers/TimestampNanosParser.java | 12 - .../csv/parsers/TimestampParserBase.java | 89 - .../csv/parsers/TimestampSecondsParser.java | 12 - .../io/deephaven/csv/reading/CellGrabber.java | 347 --- .../io/deephaven/csv/reading/CsvReader.java | 564 ----- .../reading/ParseDenseStorageToColumn.java | 376 --- .../csv/reading/ParseInputToDenseStorage.java | 139 -- .../deephaven/csv/reading/TypeConverter.java | 229 -- .../java/io/deephaven/csv/sinks/Sink.java | 56 - .../io/deephaven/csv/sinks/SinkFactory.java | 263 --- .../java/io/deephaven/csv/sinks/Source.java | 35 - .../csv/tokenization/RangeTests.java | 115 - .../deephaven/csv/tokenization/Tokenizer.java | 690 ------ .../csv/util/CsvReaderException.java | 14 - .../java/io/deephaven/csv/util/Renderer.java | 65 - .../java/io/deephaven/csv/CsvReaderTest.java | 2010 ----------------- .../java/io/deephaven/csv/CsvTestSuite.java | 2 +- .../io/deephaven/csv/DeephavenCsvTest.java | 7 +- 47 files changed, 371 insertions(+), 8095 deletions(-) delete mode 100644 extensions/csv/src/main/java/io/deephaven/csv/CsvSpecs.java delete mode 100644 extensions/csv/src/main/java/io/deephaven/csv/InferenceSpecs.java delete mode 100644 extensions/csv/src/main/java/io/deephaven/csv/containers/ByteSlice.java delete mode 100644 extensions/csv/src/main/java/io/deephaven/csv/containers/GrowableByteBuffer.java delete mode 100644 extensions/csv/src/main/java/io/deephaven/csv/densestorage/DenseStorageConstants.java delete mode 100644 extensions/csv/src/main/java/io/deephaven/csv/densestorage/DenseStorageReader.java delete mode 100644 extensions/csv/src/main/java/io/deephaven/csv/densestorage/DenseStorageWriter.java delete mode 100644 extensions/csv/src/main/java/io/deephaven/csv/densestorage/QueueNode.java delete mode 100644 extensions/csv/src/main/java/io/deephaven/csv/densestorage/QueueReader.java delete mode 100644 
extensions/csv/src/main/java/io/deephaven/csv/densestorage/QueueWriter.java delete mode 100644 extensions/csv/src/main/java/io/deephaven/csv/parsers/BooleanAsByteParser.java delete mode 100644 extensions/csv/src/main/java/io/deephaven/csv/parsers/ByteParser.java delete mode 100644 extensions/csv/src/main/java/io/deephaven/csv/parsers/CharParser.java delete mode 100644 extensions/csv/src/main/java/io/deephaven/csv/parsers/DateTimeAsLongParser.java delete mode 100644 extensions/csv/src/main/java/io/deephaven/csv/parsers/DoubleParser.java delete mode 100644 extensions/csv/src/main/java/io/deephaven/csv/parsers/FloatFastParser.java delete mode 100644 extensions/csv/src/main/java/io/deephaven/csv/parsers/FloatStrictParser.java delete mode 100644 extensions/csv/src/main/java/io/deephaven/csv/parsers/IntParser.java delete mode 100644 extensions/csv/src/main/java/io/deephaven/csv/parsers/IteratorHolder.java delete mode 100644 extensions/csv/src/main/java/io/deephaven/csv/parsers/LongParser.java delete mode 100644 extensions/csv/src/main/java/io/deephaven/csv/parsers/Parser.java delete mode 100644 extensions/csv/src/main/java/io/deephaven/csv/parsers/Parsers.java delete mode 100644 extensions/csv/src/main/java/io/deephaven/csv/parsers/ShortParser.java delete mode 100644 extensions/csv/src/main/java/io/deephaven/csv/parsers/StringParser.java delete mode 100644 extensions/csv/src/main/java/io/deephaven/csv/parsers/TimestampMicrosParser.java delete mode 100644 extensions/csv/src/main/java/io/deephaven/csv/parsers/TimestampMillisParser.java delete mode 100644 extensions/csv/src/main/java/io/deephaven/csv/parsers/TimestampNanosParser.java delete mode 100644 extensions/csv/src/main/java/io/deephaven/csv/parsers/TimestampParserBase.java delete mode 100644 extensions/csv/src/main/java/io/deephaven/csv/parsers/TimestampSecondsParser.java delete mode 100644 extensions/csv/src/main/java/io/deephaven/csv/reading/CellGrabber.java delete mode 100644 extensions/csv/src/main/java/io/deephaven/csv/reading/CsvReader.java delete mode 100644 extensions/csv/src/main/java/io/deephaven/csv/reading/ParseDenseStorageToColumn.java delete mode 100644 extensions/csv/src/main/java/io/deephaven/csv/reading/ParseInputToDenseStorage.java delete mode 100644 extensions/csv/src/main/java/io/deephaven/csv/reading/TypeConverter.java delete mode 100644 extensions/csv/src/main/java/io/deephaven/csv/sinks/Sink.java delete mode 100644 extensions/csv/src/main/java/io/deephaven/csv/sinks/SinkFactory.java delete mode 100644 extensions/csv/src/main/java/io/deephaven/csv/sinks/Source.java delete mode 100644 extensions/csv/src/main/java/io/deephaven/csv/tokenization/RangeTests.java delete mode 100644 extensions/csv/src/main/java/io/deephaven/csv/tokenization/Tokenizer.java delete mode 100644 extensions/csv/src/main/java/io/deephaven/csv/util/CsvReaderException.java delete mode 100644 extensions/csv/src/main/java/io/deephaven/csv/util/Renderer.java delete mode 100644 extensions/csv/src/test/java/io/deephaven/csv/CsvReaderTest.java diff --git a/build.gradle b/build.gradle index 1b44687f7c8..037a7012834 100644 --- a/build.gradle +++ b/build.gradle @@ -60,6 +60,12 @@ allprojects { includeGroup 'org.apache.kafka' } } + maven { + url "https://s01.oss.sonatype.org/content/repositories/snapshots/" + mavenContent { + snapshotsOnly() + } + } } } diff --git a/extensions/csv/build.gradle b/extensions/csv/build.gradle index ddbebd4cdb4..d76e66784b3 100644 --- a/extensions/csv/build.gradle +++ b/extensions/csv/build.gradle @@ -6,10 +6,10 @@ description 'CSV: 
Support to read and write engine tables from/to CSV' dependencies { api project(':engine-api') + api 'io.deephaven:deephaven-csv:0.0.1-SNAPSHOT' implementation project(':engine-table'), - project(':engine-base'), - 'ch.randelshofer:fastdoubleparser:0.3.0' + project(':engine-base') Classpaths.inheritImmutables(project) diff --git a/extensions/csv/src/main/java/io/deephaven/csv/CsvSpecs.java b/extensions/csv/src/main/java/io/deephaven/csv/CsvSpecs.java deleted file mode 100644 index aeaeaa5e371..00000000000 --- a/extensions/csv/src/main/java/io/deephaven/csv/CsvSpecs.java +++ /dev/null @@ -1,865 +0,0 @@ -package io.deephaven.csv; - -import gnu.trove.map.hash.TIntObjectHashMap; -import io.deephaven.annotations.BuildableStyle; -import io.deephaven.api.util.NameValidator; -import io.deephaven.chunk.*; -import io.deephaven.chunk.attributes.Values; -import io.deephaven.csv.containers.ByteSlice; -import io.deephaven.csv.parsers.Parser; -import io.deephaven.csv.parsers.Parsers; -import io.deephaven.csv.reading.CsvReader; -import io.deephaven.csv.sinks.Sink; -import io.deephaven.csv.sinks.SinkFactory; -import io.deephaven.csv.sinks.Source; -import io.deephaven.csv.tokenization.RangeTests; -import io.deephaven.csv.tokenization.Tokenizer; -import io.deephaven.csv.util.CsvReaderException; -import io.deephaven.engine.rowset.RowSequence; -import io.deephaven.engine.rowset.RowSequenceFactory; -import io.deephaven.engine.rowset.RowSetFactory; -import io.deephaven.engine.rowset.TrackingRowSet; -import io.deephaven.engine.table.*; -import io.deephaven.engine.table.impl.InMemoryTable; -import io.deephaven.engine.table.impl.sources.*; -import io.deephaven.qst.column.header.ColumnHeader; -import io.deephaven.qst.table.NewTable; -import io.deephaven.qst.table.TableHeader; -import io.deephaven.qst.type.*; -import io.deephaven.time.DateTime; -import io.deephaven.time.TimeZone; -import io.deephaven.util.BooleanUtils; -import io.deephaven.util.QueryConstants; -import org.apache.commons.io.input.ReaderInputStream; -import org.apache.commons.lang3.mutable.MutableLong; -import org.apache.commons.lang3.mutable.MutableObject; -import org.immutables.value.Value.Default; -import org.immutables.value.Value.Immutable; - -import java.io.*; -import java.nio.charset.Charset; -import java.nio.charset.StandardCharsets; -import java.time.ZoneId; -import java.util.*; - -/** - * A specification object for parsing a CSV, or CSV-like, structure into a {@link NewTable}. - */ -@Immutable -@BuildableStyle -public abstract class CsvSpecs { - - public interface Builder { - Builder header(TableHeader header); - - Builder addHeaders(String... headers); - - Builder addAllHeaders(Iterable headers); - - Builder putHeaderForIndex(int index, String header); - - Builder putParserForName(String columnName, Parser parser); - - Builder putParserForIndex(int index, Parser parser); - - Builder nullValueLiteral(String nullValueLiteral); - - Builder putNullValueLiteralForName(String columnName, String nullValueLiteral); - - Builder putNullValueLiteralForIndex(int index, String nullValueLiteral); - - Builder inference(InferenceSpecs inferenceSpecs); - - Builder hasHeaderRow(boolean hasHeaderRow); - - Builder delimiter(char delimiter); - - Builder quote(char quote); - - Builder ignoreSurroundingSpaces(boolean ignoreSurroundingSpaces); - - Builder trim(boolean trim); - - Builder concurrent(boolean async); - - CsvSpecs build(); - } - - /** - * Creates a builder for {@link CsvSpecs}. 
- * - * @return the builder - */ - public static Builder builder() { - return ImmutableCsvSpecs.builder(); - } - - /** - * A comma-separated-value delimited format. - * - *
<p>
- * Equivalent to {@code builder().build()}. - * - * @return the spec - */ - public static CsvSpecs csv() { - return builder().build(); - } - - /** - * A tab-separated-value delimited format. - * - *
<p>
- * Equivalent to {@code builder().delimiter('\t').build()}. - * - * @return the spec - */ - public static CsvSpecs tsv() { - return builder().delimiter('\t').build(); - } - - /** - * A header-less, CSV format. - * - *
<p>
- * Equivalent to {@code builder().hasHeaderRow(false).build()}. - * - * @return the spec - */ - public static CsvSpecs headerless() { - return builder().hasHeaderRow(false).build(); - } - - /** - * A header-less, CSV format, with the user providing the {@code header}. - * - *
<p>
- * Equivalent to {@code builder().hasHeaderRow(false).header(header).build()}. - * - * @param header the header to use - * @return the spec - */ - public static CsvSpecs headerless(TableHeader header) { - return builder().hasHeaderRow(false).header(header).build(); - } - - public static CsvSpecs fromLegacyFormat(String format) { - if (format == null) { - return CsvSpecs.csv(); - } else if (format.length() == 1) { - return CsvSpecs.builder().delimiter(format.charAt(0)).build(); - } else if ("TRIM".equals(format)) { - return CsvSpecs.builder().trim(true).build(); - } else if ("DEFAULT".equals(format)) { - return CsvSpecs.builder().ignoreSurroundingSpaces(false).build(); - } else if ("TDF".equals(format)) { - return CsvSpecs.tsv(); - } - return null; - } - - /** - * A header, when specified, hints at the parser to use. - * - *
<p>
- * To be even more explicit, callers may also use {@link #parserForName()} or {@link #parserForIndex()}. - * - * @return the table header. - */ - public abstract Optional header(); - - /** - * A list of column header names that, when specified, overrides the column names that would otherwise be used. - */ - public abstract List headers(); - - /** - * Header overrides, where the keys are 1-based column indices. Specifying a column header overrides the header that - * would otherwise be used for that specific column. - */ - public abstract Map headerForIndex(); - - /** - * The parsers, where the keys are column names. Specifying a parser for a column forgoes inference for that column. - * - * @return the parsers. - */ - public abstract Map> parserForName(); - - /** - * The parsers, where the keys are 1-based column indices. Specifying a parser for a column forgoes inference for - * that column. - * - * @return the parsers. - */ - public abstract Map> parserForIndex(); - - /** - * The null value literal that is used when it is not overridden for any particular column. - */ - @Default - public String nullValueLiteral() { - return ""; - } - - /** - * The null value literals, where the keys are column names. Specifying a null value literal for a column overrides - * the default null value literal, which is the empty string. - * - * @return the null value literals - */ - public abstract Map nullValueLiteralForName(); - - /** - * The null value literals, where the keys are 1-based column indices. Specifying a null value literal for a column - * overrides the default null value literal, which is the empty string. - * - * @return the null value literals - */ - public abstract Map nullValueLiteralForIndex(); - - /** - * The inference specifications. - * - *
<p>
- * By default, is {@link InferenceSpecs#standard()}. - * - * @return the inference specifications - */ - @Default - public InferenceSpecs inference() { - return InferenceSpecs.standard(); - } - - /** - * The header row flag. If {@code true}, the column names of the output table will be inferred from the first row of - * the table. If {@code false}, the column names will be numbered numerically in the format "Column%d" with a - * 1-based index. - * - *
<p>
- * Note: if {@link #header()} is specified, it takes precedence over the column names that will be used. - * - *
<p>
- * By default is {@code true}. - * - * @return the header row flag - */ - @Default - public boolean hasHeaderRow() { - return true; - } - - /** - * The delimiter character. - * - *
<p>
- * By default is ','. - * - * @return the delimiter character - */ - @Default - public char delimiter() { - return ','; - } - - /** - * The quote character. - * - *
<p>
- * By default is '"'. - * - * @return the quote character - */ - @Default - public char quote() { - return '"'; - } - - - /** - * The ignore surrounding spaces flag, whether to trim leading and trailing blanks from non-quoted values. - * - *
<p>
- * By default is {@code true} - * - * @return the ignore surrounding spaces flag - */ - @Default - public boolean ignoreSurroundingSpaces() { - return true; - } - - /** - * The trim flag, whether to trim leading and trailing blanks from inside quoted values. - * - *
<p>
- * By default is {@code false}. - * - * @return the trim flag - */ - @Default - public boolean trim() { - return false; - } - - /** - * The character set. - * - *
<p>
- * By default, is UTF-8. - * - * @return the character set. - */ - @Default - public Charset charset() { - return StandardCharsets.UTF_8; - } - - /** - * Should the CSVReader run its processing steps concurrently on multiple threads for better performance. - * - * @return the concurrent flag - */ - @Default - public boolean concurrent() { - return true; - } - - /** - * Parses {@code string} according to the specifications of {@code this}. - * - * @param string the string - * @return the new table - * @throws CsvReaderException if any sort of failure occurs. - */ - public final Table parse(String string) throws CsvReaderException { - final StringReader reader = new StringReader(string); - final ReaderInputStream inputStream = new ReaderInputStream(reader, StandardCharsets.UTF_8); - return parse(inputStream); - } - - /** - * Parses {@code stream} according to the specifications of {@code this}. The {@code stream} will be closed upon - * return. - * - *
<p>
- * Note: this implementation will buffer the {@code stream} internally. - * - * @param stream the stream - * @return the new table - * @throws CsvReaderException if any sort of failure occurs. - */ - public final Table parse(InputStream stream) throws CsvReaderException { - final CsvReader csvReader = configureCsvReader(); - final CsvReader.Result result = csvReader.read(stream, makeMySinkFactory()); - - final String[] columnNames = result.columnNames(); - final Sink[] sinks = result.columns(); - final Map> columns = new LinkedHashMap<>(); - long maxSize = 0; - for (int ii = 0; ii < columnNames.length; ++ii) { - final String columnName = columnNames[ii]; - final MySinkBase sink = (MySinkBase) sinks[ii]; - maxSize = Math.max(maxSize, sink.resultSize()); - columns.put(columnName, sink.result()); - } - final TableDefinition tableDef = TableDefinition.inferFrom(columns); - final TrackingRowSet rowSet = RowSetFactory.flat(maxSize).toTracking(); - return InMemoryTable.from(tableDef, rowSet, columns); - } - - private CsvReader configureCsvReader() { - final CsvReader csvReader = new CsvReader(); - - csvReader.setConcurrent(concurrent()); - csvReader.setIgnoreSurroundingSpaces(ignoreSurroundingSpaces()); - csvReader.setTrim(trim()); - csvReader.setHasHeaders(hasHeaderRow()); - csvReader.setquoteChar(quote()); - csvReader.setFieldDelimiter(delimiter()); - csvReader.setParsers(inference().parsers()); - - for (Map.Entry> entry : parserForName().entrySet()) { - csvReader.setParserFor(entry.getKey(), entry.getValue()); - } - for (Map.Entry> entry : parserForIndex().entrySet()) { - csvReader.setParserFor(entry.getKey(), entry.getValue()); - } - - - csvReader.setNullValueLiteral(nullValueLiteral()); - for (Map.Entry entry : nullValueLiteralForName().entrySet()) { - csvReader.setNullValueLiteralFor(entry.getKey(), entry.getValue()); - } - for (Map.Entry entry : nullValueLiteralForIndex().entrySet()) { - csvReader.setNullValueLiteralFor(entry.getKey(), entry.getValue()); - } - - if (header().isPresent()) { - final List headers = new ArrayList<>(); - for (ColumnHeader ch : header().get()) { - headers.add(ch.name()); - csvReader.setParserFor(ch.name(), typeToParser(ch.componentType())); - } - csvReader.setHeaders(headers); - } - - csvReader.setNullParser(inference().nullParser()); - - csvReader.setCustomTimeZoneParser(new TimeZoneParser()); - - csvReader.setHeaderLegalizer(names -> NameValidator.legalizeColumnNames(names, - s -> s.replaceAll("[- ]", "_"), true)); - csvReader.setHeaderValidator(NameValidator::isValidColumnName); - - return csvReader; - } - - private static abstract class MySinkBase implements Sink { - protected final ArrayBackedColumnSource result; - protected long resultSize; - protected final WritableColumnSource reinterpreted; - protected final ChunkWrapInvoker> chunkWrapInvoker; - - public MySinkBase(ArrayBackedColumnSource result, Class interpClass, - ChunkWrapInvoker> chunkWrapInvoker) { - this.result = result; - this.resultSize = 0; - if (interpClass != null) { - reinterpreted = (WritableColumnSource) result.reinterpret(interpClass); - } else { - reinterpreted = result; - } - this.chunkWrapInvoker = chunkWrapInvoker; - } - - @Override - public final void write(final TARRAY src, final boolean[] isNull, final long destBegin, final long destEnd, - boolean appending_unused) { - if (destBegin == destEnd) { - return; - } - final int size = Math.toIntExact(destEnd - destBegin); - nullFlagsToValues(src, isNull, size); - reinterpreted.ensureCapacity(destEnd); - resultSize = 
Math.max(resultSize, destEnd); - try (final ChunkSink.FillFromContext context = reinterpreted.makeFillFromContext(size); - final RowSequence range = RowSequenceFactory.forRange(destBegin, destEnd - 1)) { - Chunk chunk = chunkWrapInvoker.apply(src, 0, size); - reinterpreted.fillFromChunk(context, chunk, range); - } - } - - protected abstract void nullFlagsToValues(final TARRAY values, final boolean[] isNull, final int size); - - public ArrayBackedColumnSource result() { - return result; - } - - public long resultSize() { - return resultSize; - } - - protected interface ChunkWrapInvoker { - TRESULT apply(final TARRAY data, final int offset, final int capacity); - } - } - - private static abstract class MySourceAndSinkBase extends MySinkBase - implements Source, Sink { - private final ChunkWrapInvoker> writableChunkWrapInvoker; - - public MySourceAndSinkBase(ArrayBackedColumnSource result, Class interpClass, - ChunkWrapInvoker> chunkWrapInvoker, - ChunkWrapInvoker> writeableChunkWrapInvoker) { - super(result, interpClass, chunkWrapInvoker); - this.writableChunkWrapInvoker = writeableChunkWrapInvoker; - } - - @Override - public final void read(TARRAY dest, boolean[] isNull, long srcBegin, long srcEnd) { - if (srcBegin == srcEnd) { - return; - } - final int size = Math.toIntExact(srcEnd - srcBegin); - try (final ChunkSink.FillContext context = reinterpreted.makeFillContext(size); - final RowSequence range = RowSequenceFactory.forRange(srcBegin, srcEnd - 1)) { - WritableChunk chunk = writableChunkWrapInvoker.apply(dest, 0, size); - reinterpreted.fillChunk(context, chunk, range); - } - valuesToNullFlags(dest, isNull, size); - } - - protected abstract void valuesToNullFlags(final TARRAY values, final boolean[] isNull, final int size); - } - - private static final class MyCharSink extends MySinkBase { - public MyCharSink() { - super(new CharacterArraySource(), null, CharChunk::chunkWrap); - } - - @Override - protected void nullFlagsToValues(final char[] values, final boolean[] isNull, final int size) { - for (int ii = 0; ii < size; ++ii) { - if (isNull[ii]) { - values[ii] = QueryConstants.NULL_CHAR; - } - } - } - } - - private static final class MyBooleanAsByteSink extends MySinkBase { - public MyBooleanAsByteSink() { - super(new BooleanArraySource(), byte.class, ByteChunk::chunkWrap); - } - - @Override - protected void nullFlagsToValues(final byte[] values, final boolean[] isNull, final int size) { - for (int ii = 0; ii < size; ++ii) { - if (isNull[ii]) { - values[ii] = BooleanUtils.NULL_BOOLEAN_AS_BYTE; - } - } - } - } - - private static final class MyByteSink extends MySourceAndSinkBase { - public MyByteSink() { - super(new ByteArraySource(), null, ByteChunk::chunkWrap, WritableByteChunk::writableChunkWrap); - } - - @Override - protected void nullFlagsToValues(final byte[] values, final boolean[] isNull, final int size) { - for (int ii = 0; ii != size; ++ii) { - if (isNull[ii]) { - values[ii] = QueryConstants.NULL_BYTE; - } - } - } - - @Override - protected void valuesToNullFlags(final byte[] values, final boolean[] isNull, final int size) { - for (int ii = 0; ii < size; ++ii) { - isNull[ii] = values[ii] == QueryConstants.NULL_BYTE; - } - } - } - - private static final class MyShortSink extends MySourceAndSinkBase { - public MyShortSink() { - super(new ShortArraySource(), null, ShortChunk::chunkWrap, WritableShortChunk::writableChunkWrap); - } - - @Override - protected void nullFlagsToValues(final short[] values, final boolean[] isNull, final int size) { - for (int ii = 0; ii != size; ++ii) { - 
if (isNull[ii]) { - values[ii] = QueryConstants.NULL_SHORT; - } - } - } - - @Override - protected void valuesToNullFlags(final short[] values, final boolean[] isNull, final int size) { - for (int ii = 0; ii < size; ++ii) { - isNull[ii] = values[ii] == QueryConstants.NULL_SHORT; - } - } - } - - private static final class MyIntSink extends MySourceAndSinkBase { - public MyIntSink() { - super(new IntegerArraySource(), null, IntChunk::chunkWrap, WritableIntChunk::writableChunkWrap); - } - - @Override - protected void nullFlagsToValues(final int[] values, final boolean[] isNull, final int size) { - for (int ii = 0; ii != size; ++ii) { - if (isNull[ii]) { - values[ii] = QueryConstants.NULL_INT; - } - } - } - - @Override - protected void valuesToNullFlags(final int[] values, final boolean[] isNull, final int size) { - for (int ii = 0; ii < size; ++ii) { - isNull[ii] = values[ii] == QueryConstants.NULL_INT; - } - } - } - - private static final class MyLongSink extends MySourceAndSinkBase { - public MyLongSink() { - super(new LongArraySource(), null, LongChunk::chunkWrap, WritableLongChunk::writableChunkWrap); - } - - @Override - protected void nullFlagsToValues(final long[] values, final boolean[] isNull, final int size) { - for (int ii = 0; ii != size; ++ii) { - if (isNull[ii]) { - values[ii] = QueryConstants.NULL_LONG; - } - } - } - - @Override - protected void valuesToNullFlags(final long[] values, final boolean[] isNull, final int size) { - for (int ii = 0; ii < size; ++ii) { - isNull[ii] = values[ii] == QueryConstants.NULL_LONG; - } - } - - } - - private static final class MyFloatSink extends MySinkBase { - public MyFloatSink() { - super(new FloatArraySource(), null, FloatChunk::chunkWrap); - } - - @Override - protected void nullFlagsToValues(final float[] values, final boolean[] isNull, final int size) { - for (int ii = 0; ii != size; ++ii) { - if (isNull[ii]) { - values[ii] = QueryConstants.NULL_FLOAT; - } - } - } - } - - private static final class MyDoubleSink extends MySinkBase { - public MyDoubleSink() { - super(new DoubleArraySource(), null, DoubleChunk::chunkWrap); - } - - @Override - protected void nullFlagsToValues(final double[] values, final boolean[] isNull, final int size) { - for (int ii = 0; ii != size; ++ii) { - if (isNull[ii]) { - values[ii] = QueryConstants.NULL_DOUBLE; - } - } - } - } - - private static final class MyStringSink extends MySinkBase { - public MyStringSink() { - super(new ObjectArraySource<>(String.class), null, ObjectChunk::chunkWrap); - } - - @Override - protected void nullFlagsToValues(final String[] values, final boolean[] isNull, final int size) { - for (int ii = 0; ii != size; ++ii) { - if (isNull[ii]) { - values[ii] = null; - } - } - } - } - - private static final class MyDateTimeAsLongSink extends MySinkBase { - public MyDateTimeAsLongSink() { - super(new DateTimeArraySource(), long.class, LongChunk::chunkWrap); - } - - @Override - protected void nullFlagsToValues(final long[] values, final boolean[] isNull, final int size) { - for (int ii = 0; ii != size; ++ii) { - if (isNull[ii]) { - values[ii] = QueryConstants.NULL_LONG; - } - } - } - } - - private static SinkFactory makeMySinkFactory() { - return SinkFactory.of( - MyByteSink::new, QueryConstants.NULL_BYTE_BOXED, - MyShortSink::new, QueryConstants.NULL_SHORT_BOXED, - MyIntSink::new, QueryConstants.NULL_INT_BOXED, - MyLongSink::new, QueryConstants.NULL_LONG_BOXED, - MyFloatSink::new, QueryConstants.NULL_FLOAT_BOXED, - MyDoubleSink::new, QueryConstants.NULL_DOUBLE_BOXED, - MyBooleanAsByteSink::new, - 
MyCharSink::new, QueryConstants.NULL_CHAR, - MyStringSink::new, null, - MyDateTimeAsLongSink::new, QueryConstants.NULL_LONG, - MyDateTimeAsLongSink::new, QueryConstants.NULL_LONG); - } - - private static Parser typeToParser(Type type) { - return type.walk(new MyVisitor()).out; - } - - private static final class MyVisitor implements Type.Visitor, PrimitiveType.Visitor, GenericType.Visitor { - private Parser out; - - @Override - public void visit(PrimitiveType primitiveType) { - primitiveType.walk((PrimitiveType.Visitor) this); - } - - @Override - public void visit(GenericType genericType) { - genericType.walk((GenericType.Visitor) this); - } - - @Override - public void visit(BooleanType booleanType) { - out = Parsers.BOOLEAN; - } - - @Override - public void visit(ByteType byteType) { - out = Parsers.BYTE; - } - - @Override - public void visit(CharType charType) { - out = Parsers.CHAR; - } - - @Override - public void visit(ShortType shortType) { - out = Parsers.SHORT; - } - - @Override - public void visit(IntType intType) { - out = Parsers.INT; - } - - @Override - public void visit(LongType longType) { - out = Parsers.LONG; - } - - @Override - public void visit(FloatType floatType) { - out = Parsers.FLOAT_FAST; - } - - @Override - public void visit(DoubleType doubleType) { - out = Parsers.DOUBLE; - } - - @Override - public void visit(StringType stringType) { - out = Parsers.STRING; - } - - @Override - public void visit(InstantType instantType) { - throw new RuntimeException("Logic error: there is no Parser for " + instantType); - } - - @Override - public void visit(ArrayType arrayType) { - throw new RuntimeException("Logic error: there is no Parser for " + arrayType); - } - - @Override - public void visit(CustomType customType) { - throw new RuntimeException("Logic error: there is no Parser for " + customType); - } - } - - /** - * A class that aids in Deephaven TimeZone parsing. In particular it memorizes the set of known Deephaven - * DateTimeZones and keeps them in a hashmap for fast lookup. It also remembers the last timezone looked up for even - * faster access. It is used as a callback for the Tokenizer class. 
- */ - private static final class TimeZoneParser implements Tokenizer.CustomTimeZoneParser { - private static final String DEEPHAVEN_TZ_PREFIX = "TZ_"; - private static final int MAX_DEEPHAVEN_TZ_LENGTH = 3; - - private final TIntObjectHashMap zoneIdMap = new TIntObjectHashMap<>(); - - private int lastTzKey = -1; - private ZoneId lastZoneId = null; - - public TimeZoneParser() { - for (TimeZone zone : TimeZone.values()) { - final String zname = zone.name(); - if (!zname.startsWith(DEEPHAVEN_TZ_PREFIX)) { - throw new RuntimeException("Logic error: unexpected enum in DBTimeZone: " + zname); - } - final String zSuffix = zname.substring(DEEPHAVEN_TZ_PREFIX.length()); - final int zlen = zSuffix.length(); - if (zlen > MAX_DEEPHAVEN_TZ_LENGTH) { - throw new RuntimeException("Logic error: unexpectedly-long enum in DBTimeZone: " + zname); - } - final byte[] data = new byte[zlen]; - for (int ii = 0; ii < zlen; ++ii) { - final char ch = zSuffix.charAt(ii); - if (!RangeTests.isUpper(ch)) { - throw new RuntimeException("Logic error: unexpected character in DBTimeZone name: " + zname); - } - data[ii] = (byte) ch; - } - final ByteSlice bs = new ByteSlice(data, 0, data.length); - final int tzKey = tryParseTzKey(bs); - if (tzKey < 0) { - throw new RuntimeException("Logic error: can't parse DBTimeZone as key: " + zname); - } - final ZoneId zoneId = zone.getTimeZone().toTimeZone().toZoneId(); - zoneIdMap.put(tzKey, zoneId); - } - } - - @Override - public boolean tryParse(ByteSlice bs, MutableObject zoneId, MutableLong offsetSeconds) { - if (bs.size() == 0 || bs.front() != ' ') { - return false; - } - final int savedBegin = bs.begin(); - bs.setBegin(bs.begin() + 1); - final int tzKey = tryParseTzKey(bs); - if (tzKey < 0) { - bs.setBegin(savedBegin); - return false; - } - if (tzKey != lastTzKey) { - final ZoneId res = zoneIdMap.get(tzKey); - if (res == null) { - bs.setBegin(savedBegin); - return false; - } - lastTzKey = tzKey; - lastZoneId = res; - } - zoneId.setValue(lastZoneId); - offsetSeconds.setValue(0); - return true; - } - - /** - * Take up to three uppercase characters from a TimeZone string and pack them into an integer. - * - * @param bs A ByteSlice holding the timezone key. - * @return The characters packed into an int, or -1 if there are too many or too few characters, or if the - * characters are not uppercase ASCII. 
- */ - private static int tryParseTzKey(final ByteSlice bs) { - int res = 0; - int current; - for (current = bs.begin(); current != bs.end(); ++current) { - if (current - bs.begin() > MAX_DEEPHAVEN_TZ_LENGTH) { - return -1; - } - final char ch = RangeTests.toUpper((char) bs.data()[current]); - if (!RangeTests.isUpper(ch)) { - // If it's some nonalphabetic character - break; - } - res = res * 26 + (ch - 'A'); - } - if (current - bs.begin() == 0) { - return -1; - } - bs.setBegin(current); - return res; - } - } -} diff --git a/extensions/csv/src/main/java/io/deephaven/csv/CsvTools.java b/extensions/csv/src/main/java/io/deephaven/csv/CsvTools.java index 0219b27e3ce..422a474f1b0 100644 --- a/extensions/csv/src/main/java/io/deephaven/csv/CsvTools.java +++ b/extensions/csv/src/main/java/io/deephaven/csv/CsvTools.java @@ -5,31 +5,79 @@ package io.deephaven.csv; import io.deephaven.base.Procedure; +import io.deephaven.chunk.ByteChunk; +import io.deephaven.chunk.CharChunk; +import io.deephaven.chunk.Chunk; +import io.deephaven.chunk.DoubleChunk; +import io.deephaven.chunk.FloatChunk; +import io.deephaven.chunk.IntChunk; +import io.deephaven.chunk.LongChunk; +import io.deephaven.chunk.ObjectChunk; +import io.deephaven.chunk.ShortChunk; +import io.deephaven.chunk.WritableByteChunk; +import io.deephaven.chunk.WritableChunk; +import io.deephaven.chunk.WritableIntChunk; +import io.deephaven.chunk.WritableLongChunk; +import io.deephaven.chunk.WritableShortChunk; +import io.deephaven.chunk.attributes.Values; import io.deephaven.csv.reading.CsvReader; +import io.deephaven.csv.sinks.Sink; +import io.deephaven.csv.sinks.SinkFactory; +import io.deephaven.csv.sinks.Source; import io.deephaven.csv.util.CsvReaderException; import io.deephaven.datastructures.util.CollectionUtil; +import io.deephaven.engine.rowset.RowSequence; +import io.deephaven.engine.rowset.RowSequenceFactory; +import io.deephaven.engine.rowset.RowSetFactory; +import io.deephaven.engine.rowset.TrackingRowSet; +import io.deephaven.engine.table.ChunkSink; +import io.deephaven.engine.table.ColumnSource; import io.deephaven.engine.table.DataColumn; import io.deephaven.engine.table.MatchPair; import io.deephaven.engine.table.Table; +import io.deephaven.engine.table.TableDefinition; +import io.deephaven.engine.table.WritableColumnSource; import io.deephaven.engine.table.impl.InMemoryTable; import io.deephaven.engine.table.impl.perf.QueryPerformanceNugget; import io.deephaven.engine.table.impl.perf.QueryPerformanceRecorder; -import io.deephaven.time.DateTime; -import io.deephaven.time.TimeZone; +import io.deephaven.engine.table.impl.sources.ArrayBackedColumnSource; +import io.deephaven.engine.table.impl.sources.BooleanArraySource; +import io.deephaven.engine.table.impl.sources.ByteArraySource; +import io.deephaven.engine.table.impl.sources.CharacterArraySource; +import io.deephaven.engine.table.impl.sources.DateTimeArraySource; +import io.deephaven.engine.table.impl.sources.DoubleArraySource; +import io.deephaven.engine.table.impl.sources.FloatArraySource; +import io.deephaven.engine.table.impl.sources.IntegerArraySource; +import io.deephaven.engine.table.impl.sources.LongArraySource; +import io.deephaven.engine.table.impl.sources.ObjectArraySource; +import io.deephaven.engine.table.impl.sources.ShortArraySource; import io.deephaven.engine.util.PathUtil; import io.deephaven.engine.util.TableTools; import io.deephaven.io.streams.BzipFileOutputStream; +import io.deephaven.time.DateTime; +import io.deephaven.time.TimeZone; +import 
io.deephaven.util.BooleanUtils; +import io.deephaven.util.QueryConstants; import io.deephaven.util.annotations.ScriptApi; import org.jetbrains.annotations.Nullable; -import java.io.*; +import java.io.BufferedWriter; +import java.io.FileWriter; +import java.io.IOException; +import java.io.InputStream; +import java.io.OutputStreamWriter; +import java.io.PrintStream; +import java.io.PrintWriter; +import java.io.Writer; import java.net.MalformedURLException; import java.net.URL; import java.nio.file.Path; import java.nio.file.Paths; import java.util.Arrays; import java.util.Collection; +import java.util.LinkedHashMap; import java.util.List; +import java.util.Map; /** * Utilities for reading and writing CSV files to and from {@link Table}s @@ -151,7 +199,20 @@ public static Table readCsv(String path, CsvSpecs specs) throws CsvReaderExcepti */ @ScriptApi public static Table readCsv(InputStream stream, CsvSpecs specs) throws CsvReaderException { - return specs.parse(stream); + final CsvReader.Result result = CsvReader.read(specs, stream, makeMySinkFactory()); + final String[] columnNames = result.columnNames(); + final Sink[] sinks = result.columns(); + final Map> columns = new LinkedHashMap<>(); + long maxSize = 0; + for (int ii = 0; ii < columnNames.length; ++ii) { + final String columnName = columnNames[ii]; + final MySinkBase sink = (MySinkBase) sinks[ii]; + maxSize = Math.max(maxSize, sink.resultSize()); + columns.put(columnName, sink.result()); + } + final TableDefinition tableDef = TableDefinition.inferFrom(columns); + final TrackingRowSet rowSet = RowSetFactory.flat(maxSize).toTracking(); + return InMemoryTable.from(tableDef, rowSet, columns); } /** @@ -166,7 +227,7 @@ public static Table readCsv(InputStream stream, CsvSpecs specs) throws CsvReader @ScriptApi public static Table readCsv(URL url, CsvSpecs specs) throws CsvReaderException { try { - return specs.parse(url.openStream()); + return readCsv(url.openStream(), specs); } catch (IOException inner) { throw new CsvReaderException("Caught exception", inner); } @@ -189,7 +250,7 @@ public static Table readCsv(URL url, CsvSpecs specs) throws CsvReaderException { @ScriptApi public static Table readCsv(Path path, CsvSpecs specs) throws CsvReaderException { try { - return specs.parse(PathUtil.open(path)); + return readCsv(PathUtil.open(path), specs); } catch (IOException inner) { throw new CsvReaderException("Caught exception", inner); } @@ -256,7 +317,7 @@ public static Table readHeaderlessCsv(String filePath, String... 
columnNames) th @ScriptApi @Deprecated public static Table readCsv(InputStream is, final String format) throws CsvReaderException { - final CsvSpecs specs = CsvSpecs.fromLegacyFormat(format); + final CsvSpecs specs = fromLegacyFormat(format); if (specs == null) { throw new IllegalArgumentException(String.format("Unable to map legacy format '%s' into CsvSpecs", format)); } @@ -276,7 +337,7 @@ public static Table readCsv(InputStream is, final String format) throws CsvReade @ScriptApi @Deprecated public static Table readCsv(InputStream is, final char separator) throws CsvReaderException { - return CsvSpecs.builder().delimiter(separator).build().parse(is); + return readCsv(is, CsvSpecs.builder().delimiter(separator).build()); } private static boolean isStandardFile(URL url) { @@ -876,4 +937,291 @@ private static void writeCsvContentsSeq( nugget.done(); } } + + public static CsvSpecs fromLegacyFormat(String format) { + if (format == null) { + return CsvSpecs.csv(); + } else if (format.length() == 1) { + return CsvSpecs.builder().delimiter(format.charAt(0)).build(); + } else if ("TRIM".equals(format)) { + return CsvSpecs.builder().trim(true).build(); + } else if ("DEFAULT".equals(format)) { + return CsvSpecs.builder().ignoreSurroundingSpaces(false).build(); + } else if ("TDF".equals(format)) { + return CsvSpecs.tsv(); + } + return null; + } + + private static abstract class MySinkBase implements Sink { + protected final ArrayBackedColumnSource result; + protected long resultSize; + protected final WritableColumnSource reinterpreted; + protected final ChunkWrapInvoker> chunkWrapInvoker; + + public MySinkBase(ArrayBackedColumnSource result, Class interpClass, + ChunkWrapInvoker> chunkWrapInvoker) { + this.result = result; + this.resultSize = 0; + if (interpClass != null) { + reinterpreted = (WritableColumnSource) result.reinterpret(interpClass); + } else { + reinterpreted = result; + } + this.chunkWrapInvoker = chunkWrapInvoker; + } + + @Override + public final void write(final TARRAY src, final boolean[] isNull, final long destBegin, final long destEnd, + boolean appending_unused) { + if (destBegin == destEnd) { + return; + } + final int size = Math.toIntExact(destEnd - destBegin); + nullFlagsToValues(src, isNull, size); + reinterpreted.ensureCapacity(destEnd); + resultSize = Math.max(resultSize, destEnd); + try (final ChunkSink.FillFromContext context = reinterpreted.makeFillFromContext(size); + final RowSequence range = RowSequenceFactory.forRange(destBegin, destEnd - 1)) { + Chunk chunk = chunkWrapInvoker.apply(src, 0, size); + reinterpreted.fillFromChunk(context, chunk, range); + } + } + + protected abstract void nullFlagsToValues(final TARRAY values, final boolean[] isNull, final int size); + + public ArrayBackedColumnSource result() { + return result; + } + + public long resultSize() { + return resultSize; + } + + protected interface ChunkWrapInvoker { + TRESULT apply(final TARRAY data, final int offset, final int capacity); + } + } + + private static abstract class MySourceAndSinkBase extends MySinkBase + implements Source, Sink { + private final ChunkWrapInvoker> writableChunkWrapInvoker; + + public MySourceAndSinkBase(ArrayBackedColumnSource result, Class interpClass, + ChunkWrapInvoker> chunkWrapInvoker, + ChunkWrapInvoker> writeableChunkWrapInvoker) { + super(result, interpClass, chunkWrapInvoker); + this.writableChunkWrapInvoker = writeableChunkWrapInvoker; + } + + @Override + public final void read(TARRAY dest, boolean[] isNull, long srcBegin, long srcEnd) { + if (srcBegin == srcEnd) 
{ + return; + } + final int size = Math.toIntExact(srcEnd - srcBegin); + try (final ChunkSink.FillContext context = reinterpreted.makeFillContext(size); + final RowSequence range = RowSequenceFactory.forRange(srcBegin, srcEnd - 1)) { + WritableChunk chunk = writableChunkWrapInvoker.apply(dest, 0, size); + reinterpreted.fillChunk(context, chunk, range); + } + valuesToNullFlags(dest, isNull, size); + } + + protected abstract void valuesToNullFlags(final TARRAY values, final boolean[] isNull, final int size); + } + + private static final class MyCharSink extends MySinkBase { + public MyCharSink() { + super(new CharacterArraySource(), null, CharChunk::chunkWrap); + } + + @Override + protected void nullFlagsToValues(final char[] values, final boolean[] isNull, final int size) { + for (int ii = 0; ii < size; ++ii) { + if (isNull[ii]) { + values[ii] = QueryConstants.NULL_CHAR; + } + } + } + } + + private static final class MyBooleanAsByteSink extends MySinkBase { + public MyBooleanAsByteSink() { + super(new BooleanArraySource(), byte.class, ByteChunk::chunkWrap); + } + + @Override + protected void nullFlagsToValues(final byte[] values, final boolean[] isNull, final int size) { + for (int ii = 0; ii < size; ++ii) { + if (isNull[ii]) { + values[ii] = BooleanUtils.NULL_BOOLEAN_AS_BYTE; + } + } + } + } + + private static final class MyByteSink extends MySourceAndSinkBase { + public MyByteSink() { + super(new ByteArraySource(), null, ByteChunk::chunkWrap, WritableByteChunk::writableChunkWrap); + } + + @Override + protected void nullFlagsToValues(final byte[] values, final boolean[] isNull, final int size) { + for (int ii = 0; ii != size; ++ii) { + if (isNull[ii]) { + values[ii] = QueryConstants.NULL_BYTE; + } + } + } + + @Override + protected void valuesToNullFlags(final byte[] values, final boolean[] isNull, final int size) { + for (int ii = 0; ii < size; ++ii) { + isNull[ii] = values[ii] == QueryConstants.NULL_BYTE; + } + } + } + + private static final class MyShortSink extends MySourceAndSinkBase { + public MyShortSink() { + super(new ShortArraySource(), null, ShortChunk::chunkWrap, WritableShortChunk::writableChunkWrap); + } + + @Override + protected void nullFlagsToValues(final short[] values, final boolean[] isNull, final int size) { + for (int ii = 0; ii != size; ++ii) { + if (isNull[ii]) { + values[ii] = QueryConstants.NULL_SHORT; + } + } + } + + @Override + protected void valuesToNullFlags(final short[] values, final boolean[] isNull, final int size) { + for (int ii = 0; ii < size; ++ii) { + isNull[ii] = values[ii] == QueryConstants.NULL_SHORT; + } + } + } + + private static final class MyIntSink extends MySourceAndSinkBase { + public MyIntSink() { + super(new IntegerArraySource(), null, IntChunk::chunkWrap, WritableIntChunk::writableChunkWrap); + } + + @Override + protected void nullFlagsToValues(final int[] values, final boolean[] isNull, final int size) { + for (int ii = 0; ii != size; ++ii) { + if (isNull[ii]) { + values[ii] = QueryConstants.NULL_INT; + } + } + } + + @Override + protected void valuesToNullFlags(final int[] values, final boolean[] isNull, final int size) { + for (int ii = 0; ii < size; ++ii) { + isNull[ii] = values[ii] == QueryConstants.NULL_INT; + } + } + } + + private static final class MyLongSink extends MySourceAndSinkBase { + public MyLongSink() { + super(new LongArraySource(), null, LongChunk::chunkWrap, WritableLongChunk::writableChunkWrap); + } + + @Override + protected void nullFlagsToValues(final long[] values, final boolean[] isNull, final int size) { + for (int 
ii = 0; ii != size; ++ii) { + if (isNull[ii]) { + values[ii] = QueryConstants.NULL_LONG; + } + } + } + + @Override + protected void valuesToNullFlags(final long[] values, final boolean[] isNull, final int size) { + for (int ii = 0; ii < size; ++ii) { + isNull[ii] = values[ii] == QueryConstants.NULL_LONG; + } + } + + } + + private static final class MyFloatSink extends MySinkBase { + public MyFloatSink() { + super(new FloatArraySource(), null, FloatChunk::chunkWrap); + } + + @Override + protected void nullFlagsToValues(final float[] values, final boolean[] isNull, final int size) { + for (int ii = 0; ii != size; ++ii) { + if (isNull[ii]) { + values[ii] = QueryConstants.NULL_FLOAT; + } + } + } + } + + private static final class MyDoubleSink extends MySinkBase { + public MyDoubleSink() { + super(new DoubleArraySource(), null, DoubleChunk::chunkWrap); + } + + @Override + protected void nullFlagsToValues(final double[] values, final boolean[] isNull, final int size) { + for (int ii = 0; ii != size; ++ii) { + if (isNull[ii]) { + values[ii] = QueryConstants.NULL_DOUBLE; + } + } + } + } + + private static final class MyStringSink extends MySinkBase { + public MyStringSink() { + super(new ObjectArraySource<>(String.class), null, ObjectChunk::chunkWrap); + } + + @Override + protected void nullFlagsToValues(final String[] values, final boolean[] isNull, final int size) { + for (int ii = 0; ii != size; ++ii) { + if (isNull[ii]) { + values[ii] = null; + } + } + } + } + + private static final class MyDateTimeAsLongSink extends MySinkBase { + public MyDateTimeAsLongSink() { + super(new DateTimeArraySource(), long.class, LongChunk::chunkWrap); + } + + @Override + protected void nullFlagsToValues(final long[] values, final boolean[] isNull, final int size) { + for (int ii = 0; ii != size; ++ii) { + if (isNull[ii]) { + values[ii] = QueryConstants.NULL_LONG; + } + } + } + } + + private static SinkFactory makeMySinkFactory() { + return SinkFactory.of( + MyByteSink::new, QueryConstants.NULL_BYTE_BOXED, + MyShortSink::new, QueryConstants.NULL_SHORT_BOXED, + MyIntSink::new, QueryConstants.NULL_INT_BOXED, + MyLongSink::new, QueryConstants.NULL_LONG_BOXED, + MyFloatSink::new, QueryConstants.NULL_FLOAT_BOXED, + MyDoubleSink::new, QueryConstants.NULL_DOUBLE_BOXED, + MyBooleanAsByteSink::new, + MyCharSink::new, QueryConstants.NULL_CHAR, + MyStringSink::new, null, + MyDateTimeAsLongSink::new, QueryConstants.NULL_LONG, + MyDateTimeAsLongSink::new, QueryConstants.NULL_LONG); + } } diff --git a/extensions/csv/src/main/java/io/deephaven/csv/InferenceSpecs.java b/extensions/csv/src/main/java/io/deephaven/csv/InferenceSpecs.java deleted file mode 100644 index 5a5452bf113..00000000000 --- a/extensions/csv/src/main/java/io/deephaven/csv/InferenceSpecs.java +++ /dev/null @@ -1,136 +0,0 @@ -package io.deephaven.csv; - -import io.deephaven.annotations.BuildableStyle; -import io.deephaven.csv.parsers.Parser; -import io.deephaven.csv.parsers.Parsers; -import org.immutables.value.Value.Default; -import org.immutables.value.Value.Immutable; -import org.jetbrains.annotations.Nullable; - -import java.util.*; - -/** - * Inference specifications contains the configuration and logic for inferring an acceptable parser from string values. 
- * - * @see #infer(Iterator) - */ -@Immutable -@BuildableStyle -public abstract class InferenceSpecs { - public static final List> STRINGS_PARSERS = Parsers.STRINGS; - - public static final List> MINIMAL_PARSERS = Parsers.MINIMAL; - - public static final List> STANDARD_PARSERS = Parsers.DEFAULT; - - public static final List> COMPLETE_PARSERS = Parsers.COMPLETE; - - public static final List> STANDARD_TIMES_PARSERS = Parsers.STANDARD_TIMES; - - public static final List> STANDARD_MILLITIMES_PARSERS = Parsers.STANDARD_MILLITIMES; - - public static final List> STANDARD_MICROTIMES_PARSERS = Parsers.STANDARD_MICROTIMES; - - public static final List> STANDARD_NANOTIMES_PARSERS = Parsers.STANDARD_NANOTIMES; - - /** - * Creates a builder for {@link InferenceSpecs}. - * - * @return the builder - */ - public static Builder builder() { - return ImmutableInferenceSpecs.builder(); - } - - /** - * The string-only inference. - * - * @return the string-only inference - */ - public static InferenceSpecs strings() { - return builder().addAllParsers(STRINGS_PARSERS).build(); - } - - /** - * The "minimal" inference. - */ - public static InferenceSpecs minimal() { - return builder().addAllParsers(MINIMAL_PARSERS).build(); - } - - /** - * The "standard" inference, does not parse bytes, shorts, or floats. - */ - public static InferenceSpecs standard() { - return builder().addAllParsers(STANDARD_PARSERS).build(); - } - - /** - * The "complete" inference. - */ - public static InferenceSpecs complete() { - return builder().addAllParsers(COMPLETE_PARSERS).build(); - } - - /** - * The standard parsers with additional {@link java.time.Instant}-based parsing. - * - * @return the standard times inference - */ - public static InferenceSpecs standardTimes() { - return builder().addAllParsers(STANDARD_TIMES_PARSERS).build(); - } - - public static InferenceSpecs milliTimes() { - return builder().addAllParsers(STANDARD_MILLITIMES_PARSERS).build(); - } - - public static InferenceSpecs microTimes() { - return builder().addAllParsers(STANDARD_MICROTIMES_PARSERS).build(); - } - - public static InferenceSpecs nanoTimes() { - return builder().addAllParsers(STANDARD_NANOTIMES_PARSERS).build(); - } - - /** - * The parsers that the user wants to participate in type inference. Note that the order that the parsers in this - * list matters only for custom parsers. In particular: - *
<ol>
- * <li>Standard system parsers (singletons from the {@link Parsers} class) will run in their standard precedence
- * order, regardless of the order they appear here.</li>
- * <li>All specified system parsers will be run before any specified custom parsers.</li>
- * <li>Custom parsers will be run in the order they are specified here.</li>
- * </ol>
- * - * @return the parsers - */ - public abstract List> parsers(); - - /** - * The parser to return when all values are null. May be {@code null}. - * - *
<p>
- * By default, returns a {@link Parsers#STRING}. - * - * @return the on-null values parser - */ - @Default - @Nullable - public Parser nullParser() { - return Parsers.STRING; - } - - public interface Builder { - - Builder nullParser(Parser parser); - - Builder addParsers(Parser item); - - Builder addParsers(Parser... items); - - Builder addAllParsers(Iterable> items); - - InferenceSpecs build(); - } -} diff --git a/extensions/csv/src/main/java/io/deephaven/csv/containers/ByteSlice.java b/extensions/csv/src/main/java/io/deephaven/csv/containers/ByteSlice.java deleted file mode 100644 index 1aaa3857ee4..00000000000 --- a/extensions/csv/src/main/java/io/deephaven/csv/containers/ByteSlice.java +++ /dev/null @@ -1,109 +0,0 @@ -package io.deephaven.csv.containers; - -/** - * An object that represents a slice of byte data. This object is intended to be reusable. - */ -public final class ByteSlice { - /** - * The underlying data. - */ - private byte[] data; - /** - * The index of the first data element. - */ - private int begin; - /** - * The index that is one past the last data element. - */ - private int end; - - /** - * Make an empty ByteSlice with a null underlying array. - */ - public ByteSlice() {} - - /** - * Constructs a ByteSlice from the half-open interval [{@code begin}, {@code end}) of the array {@code data}. - */ - public ByteSlice(final byte[] data, final int begin, final int end) { - reset(data, begin, end); - } - - /** - * Reset the ByteSlice to the half-open interval [{@code begin}, {@code end}) of the array {@code data}. - */ - public void reset(final byte[] data, final int begin, final int end) { - this.data = data; - this.begin = begin; - this.end = end; - } - - /** - * Copies the slice to the destination array, starting at the specified destination position. - */ - public void copyTo(byte[] dest, int destOffset) { - System.arraycopy(data, begin, dest, destOffset, end - begin); - } - - /** - * Gets the 'begin' field of the slice - */ - public int begin() { - return begin; - } - - /** - * Gets the 'end' field of the slice. - */ - public int end() { - return end; - } - - /** - * Sets the 'begin' field of the slice. - */ - public void setBegin(int begin) { - this.begin = begin; - } - - /** - * Sets the 'end' field of the slice. - */ - public void setEnd(int end) { - this.end = end; - } - - /** - * Gets the first character of the slice. The behavior is unspecified if the slice is empty. - */ - public byte front() { - return data[begin]; - } - - /** - * Gets the last character of the slice. The behavior is unspecified if the slice is empty. - */ - public byte back() { - return data[end - 1]; - } - - /** - * Gets the underlying array from the slice. - */ - public byte[] data() { - return data; - } - - /** - * Gets the size of the slice. - */ - public int size() { - return end - begin; - } - - @Override - public String toString() { - final int size = end - begin; - return size == 0 ? "" : new String(data, begin, end - begin); - } -} diff --git a/extensions/csv/src/main/java/io/deephaven/csv/containers/GrowableByteBuffer.java b/extensions/csv/src/main/java/io/deephaven/csv/containers/GrowableByteBuffer.java deleted file mode 100644 index d700e5f0d46..00000000000 --- a/extensions/csv/src/main/java/io/deephaven/csv/containers/GrowableByteBuffer.java +++ /dev/null @@ -1,72 +0,0 @@ -package io.deephaven.csv.containers; - -/** - * This is like TByteArrayList except that you can get at the underlying data buffer and use it for your own purposes, - * assuming you know what you're doing. 
We exploit this ability to (temporarily) point our slices at the underlying - * array while we are processing slices. In terms of expected usage, this class is only used for holding the data for - * cells, and only when the cell has escaped characters (like escaped quotes) or the cell spans more than one input - * chunk (in which case we can no longer do the trick where we point a slice directly at the array that buffers our - * input). Therefore the max size of this data structure is equal to the size of the largest cell in the input (which - * likely to be in the 10s or 100s of bytes). Since it's expected to be modest in size, we don't worry too much about - * our growth strategy, which simply involves doubling when we run out of space. In fact, in practice for "normal" - * input, this object probably never reallocates. - */ -public final class GrowableByteBuffer { - private static final int INITIAL_BUFFER_SIZE = 1024; - - /** - * Underlying buffer. Grows as needed. - */ - private byte[] data = new byte[INITIAL_BUFFER_SIZE]; - /** - * Current size of the data. - */ - private int size = 0; - - /** - * Appends 'srcSize' characters from 'src', starting at 'srcOffset'. - */ - public void append(byte[] src, int srcOffset, int srcSize) { - ensure(srcSize); - System.arraycopy(src, srcOffset, data, size, srcSize); - size += srcSize; - } - - /** - * Ensure that the buffer can hold at least 'additionalSize' items. - */ - private void ensure(int additionalSize) { - final int sizeNeeded = Math.addExact(size, additionalSize); - if (sizeNeeded <= data.length) { - return; - } - - // Ensuring that we always at least double the buffer, but we may not always - // follow powers of two - final int newSize = Math.max(sizeNeeded, Math.multiplyExact(size, 2)); - final byte[] newData = new byte[newSize]; - System.arraycopy(data, 0, newData, 0, size); - data = newData; - } - - /** - * Clear the buffer. - */ - public void clear() { - size = 0; - } - - /** - * Access the underlying data array. - */ - public byte[] data() { - return data; - } - - /** - * The current size. - */ - public int size() { - return size; - } -} diff --git a/extensions/csv/src/main/java/io/deephaven/csv/densestorage/DenseStorageConstants.java b/extensions/csv/src/main/java/io/deephaven/csv/densestorage/DenseStorageConstants.java deleted file mode 100644 index 68d33cf3760..00000000000 --- a/extensions/csv/src/main/java/io/deephaven/csv/densestorage/DenseStorageConstants.java +++ /dev/null @@ -1,34 +0,0 @@ -package io.deephaven.csv.densestorage; - -/** - * Constants that control the behavior of the {@link DenseStorageWriter} and {@link DenseStorageReader}. - */ -public class DenseStorageConstants { - /** - * When input strings are less than this threshold, we pack them tightly into a chunk. When they are greater than or - * equal to this threshold, we allocate them directly as their own individual byte arrays. - */ - public static final int LARGE_THRESHOLD = 1024; - /** - * Size of the "control queue" blocks. Somewhat arbitrary but should be large-ish. We have arbitrarily chosen - * 100,000 here. - */ - public static final int CONTROL_QUEUE_SIZE = 100_000; - /** - * Size of the "packed" byte blocks. The number chosen in somewhat arbitrary but it should be large-ish (100K? 1M?) - * for performance and a decent multiple of LARGE_THRESHOLD to avoid wasting too much space at the end of each - * block. 
By making it 1024x the size of LARGE_THRESHOLD, we can show that the fraction of wasted space at the end - * of each block can never be more than (1/1024). - */ - public static final int PACKED_QUEUE_SIZE = LARGE_THRESHOLD * 1024; - /** - * Size of the "array queue". Somewhat arbitrary but should be large-ish. We have arbitrarily chosen 100K here. 10K - * might also be reasonable. - */ - public static final int ARRAY_QUEUE_SIZE = 100_000; - /** - * This sentinel value is used to indicate that the next value being read is not bytes packed into a byte block but - * rather its own byte array. - */ - public static final int LARGE_BYTE_ARRAY_SENTINEL = -1; -} diff --git a/extensions/csv/src/main/java/io/deephaven/csv/densestorage/DenseStorageReader.java b/extensions/csv/src/main/java/io/deephaven/csv/densestorage/DenseStorageReader.java deleted file mode 100644 index d1cd2008327..00000000000 --- a/extensions/csv/src/main/java/io/deephaven/csv/densestorage/DenseStorageReader.java +++ /dev/null @@ -1,69 +0,0 @@ - -package io.deephaven.csv.densestorage; - -import io.deephaven.csv.containers.ByteSlice; -import io.deephaven.csv.util.CsvReaderException; -import org.apache.commons.lang3.mutable.MutableInt; - -/** - * Companion to the {@link DenseStorageWriter}. See the documentation there for details. - */ -public final class DenseStorageReader { - /** - * Byte sequences < DENSE_THRESHOLD are compactly stored here - */ - private final QueueReader.ByteReader byteReader; - /** - * Byte sequences >= DENSE_THRESHOLD are stored here - */ - private final QueueReader.ByteArrayReader largeByteArrayReader; - /** - * Control bytes (lengths, negated lengths, or sentinels). See DenseStorageWriter. - */ - private final QueueReader.IntReader controlReader; - /** - * For the "out" parameter of controlReader.tryGetInt() - */ - private final MutableInt intHolder = new MutableInt(); - - /** - * Constructor. - */ - public DenseStorageReader(final QueueReader.IntReader controlReader, - final QueueReader.ByteReader byteReader, - final QueueReader.ByteArrayReader largeByteArrayReader) { - this.controlReader = controlReader; - this.byteReader = byteReader; - this.largeByteArrayReader = largeByteArrayReader; - } - - /** - * Tries to get the next slice from one of the inner QueueReaders. Uses data in the 'controlReader' to figure out - * which QueueReader the next slice is coming from. - * - * @param bs If the method returns true, the contents of this parameter will be updated. - * @return true if there is more data, and the ByteSlice has been populated. Otherwise, false. - */ - public boolean tryGetNextSlice(final ByteSlice bs) - throws CsvReaderException { - if (!controlReader.tryGetInt(intHolder)) { - return false; - } - final int control = intHolder.intValue(); - if (control == DenseStorageConstants.LARGE_BYTE_ARRAY_SENTINEL) { - mustSucceed(largeByteArrayReader.tryGetBytes(bs), "largeByteArrayReader"); - return true; - } - mustSucceed(byteReader.tryGetBytes(control, bs), "byteReader"); - return true; - } - - /** - * Convenience method that throws an exception if "success" is false. 
- */ - private static void mustSucceed(final boolean success, final String what) throws CsvReaderException { - if (!success) { - throw new CsvReaderException("Data unexpectedly exhausted: " + what); - } - } -} diff --git a/extensions/csv/src/main/java/io/deephaven/csv/densestorage/DenseStorageWriter.java b/extensions/csv/src/main/java/io/deephaven/csv/densestorage/DenseStorageWriter.java deleted file mode 100644 index 92de6f766f9..00000000000 --- a/extensions/csv/src/main/java/io/deephaven/csv/densestorage/DenseStorageWriter.java +++ /dev/null @@ -1,147 +0,0 @@ -package io.deephaven.csv.densestorage; - -import io.deephaven.csv.containers.ByteSlice; -import io.deephaven.csv.tokenization.RangeTests; - -/** - * The DenseStorageWriter and {@link DenseStorageReader} work in tandem, forming a FIFO queue. The DenseStorageWriter - * writes data, and the {@link DenseStorageReader} reads that data. If the {@link DenseStorageReader} "catches up", it - * will block until the DenseStorageWriter provides more data, or indicates that it is done (via the {@link #finish()} - * method. This synchronization is done at "block" granularity, so the DenseStorageReader can only proceed when the - * DenseStorageWriter has written at least a "block" of data or is done. We allow multiple independent - * {@link DenseStorageReader}s to consume the same underlying data. In our implementation this is used so our type - * inferencer can take a second "pass" over the same input data. - * - *

- * <p>
- * The point of this object is to store a sequence of (character sequences aka "strings", but not java.lang.String),
- * using a small fraction of overhead. The problem with storing every character sequence as a java.lang.String is:
- * <ol>
- * <li>Per-object overhead (probably 8 or 16 bytes depending on pointer width)</li>
- * <li>The memory cost of holding a reference to that String (again 4 or 8 bytes)</li>
- * <li>The string has to know its length (4 bytes)</li>
- * <li>Java characters are 2 bytes even though in practice many strings are ASCII-only and their chars can fit in a
- * byte. (Newer Java implementations can store text as bytes, eliminating this objection)</li>
- * </ol>
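- * <p>
- * (A worked illustration using the per-item costs listed above, which are estimates that vary by JVM and pointer
- * width: a five-character ASCII string such as "hello" costs roughly 16 bytes of object header, a 4-byte reference,
- * 4 bytes of length, and 10 bytes of two-byte chars, i.e. about 34 bytes, versus just 5 bytes when packed into a
- * shared byte block.)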
- * - *

- * <p>
- * For small strings (say the word "hello" or the input text "12345.6789") the overhead can be 100% or worse.
- *
- * <p>
- * For our purposes we:
- * <ol>
- * <li>Only need sequential access, i.e. we don't need random access into the sequence of "strings". So we can
- * support a model where we can have a forward-only cursor moving over the sequence of "strings".</li>
- * <li>Don't need to give our caller a data structure that they can hold on to. The caller only gets a "view" (a
- * slice) of the current "string" data. The view is invalidated when they move to the next "string".</li>
- * </ol>
- *
- * <p>
- * Furthermore we:
- * <ol>
- * <li>Offer a FIFO model where the reader (in a separate thread) can chase the writer but there is not an
- * inordinate amount of synchronization overhead (synchronization happens at the block level, not the "string"
- * level).</li>
- * <li>Have the ability to make multiple Readers which pass over the same underlying data. This is our low-drama way
- * of allowing our client to make multiple passes over the data, without complicating the iteration interface with,
- * e.g., a reset method.</li>
- * <li>Use a linked-list structure so that when all existing readers have moved past a block of data, that block can
- * be freed by the garbage collector without any explicit action taken by the reader.</li>
- * </ol>
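- * <p>
- * A minimal usage sketch (hypothetical driver code; in the real reader the consuming side runs on its own thread,
- * {@code cells} and {@code process} are stand-ins for the tokenized input and the consumer, and exception handling
- * is elided):
- * <pre>{@code
- * DenseStorageWriter dsw = new DenseStorageWriter();
- * DenseStorageReader pass1 = dsw.newReader(); // readers must be created before any writes
- * DenseStorageReader pass2 = dsw.newReader(); // e.g. for a second type-inference pass
- * for (ByteSlice cell : cells) {
- *     dsw.append(cell);
- * }
- * dsw.finish(); // unblocks readers waiting for more data
- * ByteSlice bs = new ByteSlice();
- * while (pass1.tryGetNextSlice(bs)) {
- *     process(bs); // bs is only valid until the next call
- * }
- * }</pre>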
- * - * If you are familiar with the structure of our inference, you may initially think that this reader-chasing-writer - * garbage collection trick doesn't buy us much because we have a two-phase parser. However, when the inferencer has - * gotten to the last parser in its set of allowable parsers (say, the String parser), or the user has specified that - * there is only one parser for this column, then the code doesn't need to do any inference and can parse the column in - * one pass. In this case, when the reader stays caught up with the writer, we are basically just buffering one block of - * data, not the whole file. - * - *

- * <p>
- * The implementation used here is to look at the "string" being added to the writer and categorize it along two
- * dimensions:
- * <ul>
- * <li>Small vs large</li>
- * <li>Byte vs char</li>
- * </ul>
- *
- * These dimensions are broken out in the following way:
- * <ul>
- * <li>Small byte "strings" are packed into a byte block, and we maintain a linked list of these byte blocks.</li>
- * <li>"Large" byte "strings" are stored directly, meaning a byte[] array is allocated for their data, then a
- * reference to that array is added to a byte-array block. (And again, we maintain a linked list of these byte-array
- * blocks.) It is not typical for CSV data to contain a cell this large, but the feature is there for completeness.
- * We do not want large "strings" to contaminate our packed byte blocks, because they would not likely pack into
- * them tightly (it would become more likely to have allocated blocks with unused storage at the end, because the
- * last big string wouldn't fit in the current block). It's OK to keep them on their own because, by definition,
- * large "strings" are not going to have much overhead as a percentage of the size of their text content.</li>
- * </ul>
- */
-public final class DenseStorageWriter {
-    /**
-     * The ints in this array indicate where the next item is stored:
      - *
-     * <ul>
-     * <li>{@link DenseStorageConstants#LARGE_BYTE_ARRAY_SENTINEL}: the item is in
-     * {@link DenseStorageWriter#largeByteArrayWriter}.</li>
-     * <li>> 0: the item is in {@link DenseStorageWriter#byteWriter} (the number of bytes is equal to this
-     * value)</li>
-     * <li>== 0: no bytes, so they're not stored anywhere. Will be interpreted as a ByteSlice with arbitrary byte
-     * data and length 0.</li>
-     * </ul>
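-     * <p>
-     * For example (illustrative sizes): appending a 5-byte cell writes the control value 5 and packs the bytes into
-     * {@link DenseStorageWriter#byteWriter}, while appending a 2000-byte cell (at or above
-     * {@link DenseStorageConstants#LARGE_THRESHOLD}) writes the sentinel and stores the bytes as a standalone array
-     * via {@link DenseStorageWriter#largeByteArrayWriter}.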
    - */ - private final QueueWriter.IntWriter controlWriter; - /** - * Byte sequences < DENSE_THRESHOLD are compactly stored here - */ - private final QueueWriter.ByteWriter byteWriter; - /** - * Byte sequences >= DENSE_THRESHOLD are stored here - */ - private final QueueWriter.ByteArrayWriter largeByteArrayWriter; - - /** - * Constructor - */ - public DenseStorageWriter() { - this.controlWriter = new QueueWriter.IntWriter(DenseStorageConstants.CONTROL_QUEUE_SIZE); - this.byteWriter = new QueueWriter.ByteWriter(DenseStorageConstants.PACKED_QUEUE_SIZE); - this.largeByteArrayWriter = new QueueWriter.ByteArrayWriter(DenseStorageConstants.ARRAY_QUEUE_SIZE); - } - - public DenseStorageReader newReader() { - return new DenseStorageReader( - controlWriter.newReader(), - byteWriter.newReader(), - largeByteArrayWriter.newReader()); - } - - /** - * Append a {@link ByteSlice} to the queue. The data will be diverted to one of the two specialized underlying - * queues, depending on its size. - */ - public void append(final ByteSlice bs) { - final boolean fctrl; - final int size = bs.size(); - if (size >= DenseStorageConstants.LARGE_THRESHOLD) { - final byte[] data = new byte[size]; - bs.copyTo(data, 0); - largeByteArrayWriter.addByteArray(data); - fctrl = controlWriter.addInt(DenseStorageConstants.LARGE_BYTE_ARRAY_SENTINEL); - } else { - byteWriter.addBytes(bs); - fctrl = controlWriter.addInt(size); - } - // If the control queue flushed, then flush all the data queues, so the reader doesn't block for a long time - // waiting for some unflushed data queue. One might worry this this is inefficient, but (a) it doesn't happen - // very often and (b) in our queue code, partially-filled blocks can share non-overlapping parts of their - // large underlying data array, so it's not too wasteful. Put another way, flushing an empty queue does nothing; - // flushing a partially-filled queue allocates a new QueueNode but not a new underlying data array; - // flushing a full queue will allocates a new QueueNode and (at the next write) a new underlying data array. - if (fctrl) { - byteWriter.flush(); - largeByteArrayWriter.flush(); - } - } - - /** - * Call this method to indicate when you are finished writing to the queue. - */ - public void finish() { - controlWriter.finish(); - byteWriter.finish(); - largeByteArrayWriter.finish(); - } -} diff --git a/extensions/csv/src/main/java/io/deephaven/csv/densestorage/QueueNode.java b/extensions/csv/src/main/java/io/deephaven/csv/densestorage/QueueNode.java deleted file mode 100644 index def822d5d83..00000000000 --- a/extensions/csv/src/main/java/io/deephaven/csv/densestorage/QueueNode.java +++ /dev/null @@ -1,28 +0,0 @@ -package io.deephaven.csv.densestorage; - -/** - * Linked list node that holds data for a {@link DenseStorageWriter} or {@link DenseStorageReader}. All fields are - * immutable except the "next" field. Synchronization for reading/writing the "next" field is managed by the - * {@link DenseStorageWriter} and {@link DenseStorageReader}. - */ -public final class QueueNode { - public final TARRAY data; - public final int begin; - public final int end; - public final boolean isLast; - /** - * Readers and writers of this field have arranged to synchronize with each other. - */ - public QueueNode next; - - /** - * Constructor. Sets this queue node to represent the half-open interval ['begin','end') of the array 'data'. 
- */ - public QueueNode(TARRAY data, int begin, int end, boolean isLast) { - this.data = data; - this.begin = begin; - this.end = end; - this.isLast = isLast; - this.next = null; - } -} diff --git a/extensions/csv/src/main/java/io/deephaven/csv/densestorage/QueueReader.java b/extensions/csv/src/main/java/io/deephaven/csv/densestorage/QueueReader.java deleted file mode 100644 index 3bd8d319dce..00000000000 --- a/extensions/csv/src/main/java/io/deephaven/csv/densestorage/QueueReader.java +++ /dev/null @@ -1,221 +0,0 @@ -package io.deephaven.csv.densestorage; - -import io.deephaven.csv.containers.ByteSlice; -import org.apache.commons.lang3.mutable.MutableInt; - -/** - * Companion to the {@link QueueWriter}. See the documentation there for details. - */ -public class QueueReader { - /** - * Sync object which synchronizes access to the "next" fields of every node in our linked list. Shared with the - * QueueWriter. - */ - private final Object sync; - /** - * Current node. - */ - private QueueNode node; - /** - * Current block we are reading from, extracted from the current node. - */ - protected TARRAY genericBlock; - /** - * Current offset in the current block. Updated as we read data. When the value reaches "end", then data in this - * block is exhausted. - */ - protected int current; - /** - * "end" offset of the current block. - */ - protected int end; - - /** - * Constructor. - */ - protected QueueReader(Object sync, QueueNode node) { - this.sync = sync; - this.node = node; - this.genericBlock = null; - this.current = 0; - this.end = 0; - } - - /** - * This method exists as a helper method for a subclass' tryGetXXX method. A typical implementation is in - * CharReader: - * - *
-     * <pre>
-     * if (current + size > end) {
-     *     if (!tryRefill(size)) {
-     *         return false;
-     *     }
-     *     typedBlock = genericBlock;
-     * }
-     * </pre>
    - * - * The "if" in the caller is actually checking for multiple cases in a single comparison. One is a normal "buffer - * empty, needs to be refilled" case. The other is a bad "something went terribly wrong" case. - * - *
      - *
-     * <ul>
-     * <li>Case 1: The "buffer empty" case. Then current == end, and therefore current + size > end (assuming
-     * size > 0, which it always is). Therefore, the 'if' inside the tryGetXXX code would evaluate to true, so the
-     * tryGetXXX code would call this method. Then this method refills the buffer.</li>
-     * <li>Case 2: The buffer is not empty, but a logic error (which can't happen if the code is correct) has caused
-     * the requested slice to go past the end of the block. Then current < end but current + size > end. Again, the
-     * 'if' inside the tryGetXXX code would evaluate to true, so the tryGetXXX code would call this method. But then
-     * the first line of our method detects the past-the-end condition and throws an exception.</li>
-     * <li>Case 3: The "buffer can satisfy the request" case. Then current + size <= end, so the 'if' inside the
-     * tryGetXXX code would evaluate to false, and the tryGetXXX method doesn't call this method.</li>
-     * </ul>
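-     * <p>
-     * Concretely (illustrative numbers): with {@code current == end == 100} and {@code size == 4}, the caller's
-     * test fires and this method refills the buffer (Case 1). With {@code current == 90}, {@code end == 100} and
-     * {@code size == 20}, the same test fires but this method throws, because a slice is never allowed to straddle
-     * a block boundary (Case 2).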
    - */ - protected boolean tryRefill(int size) { - if (current != end) { - throw new RuntimeException("Logic error: slice straddled block"); - } - while (current == end) { - if (node.isLast) { - // Hygeine. - node = null; - genericBlock = null; - current = 0; - end = 0; - return false; - } - synchronized (sync) { - while (node.next == null) { - catchyWait(sync); - } - node = node.next; - genericBlock = node.data; - current = node.begin; - end = node.end; - } - } - if (end - current < size) { - throw new RuntimeException(String.format("Logic error: got short block: expected at least %d, got %d", - size, end - current)); - } - return true; - } - - /** - * Call Object.wait() but suppress the need to deal with checked InterruptedExceptions. - */ - private static void catchyWait(Object o) { - try { - o.wait(); - } catch (InterruptedException ie) { - throw new RuntimeException("Logic error: thread interrupted: can't happen"); - } - } - - /** - * A QueueReader specialized for bytes. - */ - public static final class ByteReader extends QueueReader { - /** - * Typed version of the current block. Saves us some implicit casting from the generic TARRAY object. This is a - * performance optimization that may not matter. - */ - private byte[] typedBlock; - - /** - * Constructor. - */ - public ByteReader(final Object sync, final QueueNode head) { - super(sync, head); - } - - /** - * Tries to get the next ByteSlice from the reader. - * - * @param size The exact number of chars to place in the slice. - * @param bs The result, modified in place. - * @return true If the next ByteSlice was successfully read; false if the end of input was reached. - */ - public boolean tryGetBytes(final int size, final ByteSlice bs) { - if (current + size > end) { - if (!tryRefill(size)) { - return false; - } - typedBlock = genericBlock; - } - bs.reset(typedBlock, current, current + size); - current += size; - return true; - } - } - - /** - * A QueueReader specialized for ints. - */ - public static final class IntReader extends QueueReader { - /** - * Typed version of the current block. Saves us some implicit casting from the generic TARRAY object. This is a - * performance optimization that may not matter. - */ - private int[] typedBlock; - - /** - * Constructor. - */ - public IntReader(Object sync, QueueNode head) { - super(sync, head); - } - - /** - * Tries to get the next integer from the reader. - * - * @param result If the operation succeeds, contains the next integer. Otherwise, the contents are unspecified. - * @return true if the next value was successfully read; false if the end of input was reached. - */ - public boolean tryGetInt(final MutableInt result) { - if (current == end) { - if (!tryRefill(1)) { - return false; - } - typedBlock = genericBlock; - } - result.setValue(typedBlock[current++]); - return true; - } - } - - /** - * A QueueReader specialized for byte arrays. - */ - public static final class ByteArrayReader extends QueueReader { - /** - * Typed version of the current block. Saves us some implicit casting from the generic TARRAY object. This is a - * performance optimization that may not matter. - */ - private byte[][] typedBlock; - - public ByteArrayReader(final Object sync, final QueueNode head) { - super(sync, head); - } - - /** - * Tries to get the next ByteSlice from the reader. - * - * @param bs The result, modified in place. - * @return true If the next ByteSlice was successfully read; false if the end of input was reached. 
- */ - public boolean tryGetBytes(final ByteSlice bs) { - if (current == end) { - if (!tryRefill(1)) { - return false; - } - typedBlock = genericBlock; - } - final byte[] data = typedBlock[current++]; - bs.reset(data, 0, data.length); - return true; - } - } -} diff --git a/extensions/csv/src/main/java/io/deephaven/csv/densestorage/QueueWriter.java b/extensions/csv/src/main/java/io/deephaven/csv/densestorage/QueueWriter.java deleted file mode 100644 index 25fd26a011a..00000000000 --- a/extensions/csv/src/main/java/io/deephaven/csv/densestorage/QueueWriter.java +++ /dev/null @@ -1,258 +0,0 @@ -package io.deephaven.csv.densestorage; - -import io.deephaven.csv.containers.ByteSlice; - -import java.util.function.BiFunction; -import java.util.function.IntFunction; - -/** - * The various QueueWriters ({@link ByteWriter}, {@link IntWriter}, etc.) work in tandem with their corresponding - * {@link QueueReader}s ({@link QueueReader.ByteReader}, {@link QueueReader.IntReader}, etc), forming a FIFO queue. The - * QueueWriter writes data, and the {@link QueueReader} reads that data. If the {@link QueueReader} "catches up", it - * will block until the QueueWriter provides more data, or indicates that it is done (via the {@link #finish()} method. - * This synchronization is done at "block" granularity, so the {@link QueueReader} can only proceed when the QueueWriter - * has written at least a "block" of data or is done. We allow multiple independent {@link QueueReader}s to consume the - * same underlying data. In our implementation this is used so our type inferencer can take a second "pass" over the - * same input data. - * - * In our implementation the {@link DenseStorageWriter} and {@link DenseStorageReader} are built out of various - * QueueWriters and {@link QueueReader}s. This explains why the semantics of {@link DenseStorageWriter} and - * {@link DenseStorageReader} are similar to those of the underlying QueueWriters and {@link QueueReader}s. - */ -public class QueueWriter { - /** - * Sync object which synchronizes access to the "next" fields of every node in our linked list. Shared with the - * QueueReader. - */ - private final Object sync; - /** - * Tail of the linked list. We append here when we flush. - */ - private QueueNode tail; - /** - * Size of the chunks we allocate that we pack data into. - */ - protected final int blockSize; - /** - * Lambda for allocating arrays for our chunks. - */ - private final IntFunction arrayFactory; - /** - * Lambda to make a QueueReader of the right subtype. - */ - private final BiFunction, TREADER> readerFactory; - /** - * A flag that says whether it's still early enough to allow QueueReader creation. After the writer starts writing, - * they shouldn't be allowed to create any readers. (This is just because we want to keep the semantics simple). - */ - private boolean allowReaderCreation; - /** - * Current block we writing to. When we flush, we will write it to a new linked list node. - */ - private TARRAY genericBlock; - /** - * Start of the current block. This is typically 0, but not always. If the caller does an early flush (before the - * block is filled), you can have multiple linked list nodes sharing different segments of the same underlying block - * storage. - */ - protected int begin; - /** - * Current offset in the current block. Updated as we write data. When the value reaches "end", then data in this - * block is exhausted. - */ - protected int current; - /** - * End of the current block. The same as genericBlock.length. 
- */ - protected int end; - - /** - * Constructor. - */ - protected QueueWriter(final int blockSize, - final IntFunction arrayFactory, - final BiFunction, TREADER> readerFactory) { - this.sync = new Object(); - // Creating the linked list with a sentinel object makes linked list manipulation code simpler. - this.tail = new QueueNode<>(null, 0, 0, false); - this.blockSize = blockSize; - this.arrayFactory = arrayFactory; - this.readerFactory = readerFactory; - this.allowReaderCreation = true; - this.genericBlock = null; - this.begin = 0; - this.current = 0; - this.end = 0; - } - - /** - * Caller is finished writing. - */ - public void finish() { - flush(true); - genericBlock = null; // hygeine - begin = 0; - current = 0; - end = 0; - } - - /** - * Make a {@link QueueReader} corresponding to this QueueWriter. You can make as many {@link QueueReader}s as you - * want, but you should make them before you start writing data. - */ - public TREADER newReader() { - if (!allowReaderCreation) { - throw new RuntimeException("Logic error: must allocate readers before writing any data"); - } - return readerFactory.apply(sync, tail); - } - - /** - * This supports an "early flush" for callers like {@link DenseStorageWriter} who want to flush all their queues - * from time to time. - */ - public void flush() { - flush(false); - } - - /** - * Flush can be called at any time... when the block is empty (and hence nothing to flush), when there's some data, - * or when the data is full. - * - * @param isLast Whether this is the last node in the linked list. - */ - private void flush(boolean isLast) { - // Sometimes our users ask us to flush even if there is nothing to flush. - // If the block is an "isLast" block, we need to flush it regardless of whether it contains data. - // Otherwise (if the block is not an "isLast" block), we only flush it if it contains data. - if (!isLast && (current == begin)) { - // No need to flush. - return; - } - - // No more creating readers after the first flush. - allowReaderCreation = false; - - final QueueNode newBlob = new QueueNode<>(genericBlock, begin, current, isLast); - // If this is an early flush (before the block was filled), the next node may share - // the same underlying storage array (but disjoint segments of that array) as the current node. - // To accomplish this, we just advance "begin" to "current" here. At this point in the logic - // we don't care if that leaves the block with zero capacity (begin == end) or not. The decision - // to actually start a new block is done by the addXXX code in our subclasses which eventually - // calls flushAndAllocate. - begin = current; - synchronized (sync) { - tail.next = newBlob; - tail = newBlob; - sync.notifyAll(); - } - } - - /** - * This method exists as a helper method for a subclass' addXXX method. A typical implementation is in CharWriter: - * - *
-     * <pre>
-     * final int sliceSize = cs.size();
-     * final boolean flushHappened = current + sliceSize > end;
-     * if (flushHappened) {
-     *     typedBlock = flushAndAllocate(sliceSize);
-     * }
-     * ...
-     * </pre>
    - * - * The "flushHappened" variable (which at the point of its definition would be more precisely interpreted as "flush - * is about to happen") calculates whether the data that currently needs to be written can fit in the current block - * or not. If it can fit, the code continues on to write its data. If it can't fit, the subclass calls this - * flushAndAllocate method to flush the current block to the linked list and allocate a new one. The new block so - * allocated is guaranteed to have at be of size at least 'sizeNeeded'. - */ - protected final TARRAY flushAndAllocate(int sizeNeeded) { - flush(false); - final int capacity = Math.max(blockSize, sizeNeeded); - genericBlock = arrayFactory.apply(capacity); - begin = 0; - current = 0; - end = capacity; - return genericBlock; - } - - /** - * A QueueWriter specialized for bytes. - */ - public static final class ByteWriter extends QueueWriter { - private byte[] typedBlock = null; - - public ByteWriter(final int blockSize) { - super(blockSize, byte[]::new, QueueReader.ByteReader::new); - } - - /** - * Add bytes from a ByteSlice to the queue. - * - * @return true if the add caused a flush to happen prior to the write, false if no flush happened. - */ - public boolean addBytes(ByteSlice bs) { - final int sliceSize = bs.size(); - if (sliceSize == 0) { - return false; - } - final boolean flushHappened = current + sliceSize > end; - if (flushHappened) { - typedBlock = flushAndAllocate(sliceSize); - } - bs.copyTo(typedBlock, current); - current += sliceSize; - return flushHappened; - } - } - - /** - * A QueueWriter specialized for ints. - */ - public static final class IntWriter extends QueueWriter { - private int[] typedBlock = null; - - public IntWriter(final int blockSize) { - super(blockSize, int[]::new, QueueReader.IntReader::new); - } - - /** - * Add an int to the queue. - * - * @return true if the add caused a flush to happen prior to the write, false if no flush happened. - */ - public boolean addInt(int value) { - final boolean flushHappened = current == end; - if (flushHappened) { - typedBlock = flushAndAllocate(1); - } - typedBlock[current++] = value; - return flushHappened; - } - } - - /** - * A QueueWriter specialized for byte arrays. - */ - public static final class ByteArrayWriter extends QueueWriter { - private byte[][] block = null; - - public ByteArrayWriter(int blobSize) { - super(blobSize, byte[][]::new, QueueReader.ByteArrayReader::new); - } - - /** - * Add a byte array to the queue. - * - * @return true if the add caused a flush to happen prior to the write, false if no flush happened. - */ - public boolean addByteArray(byte[] value) { - final boolean flushHappened = current == end; - if (flushHappened) { - block = flushAndAllocate(1); - } - block[current++] = value; - return flushHappened; - } - } -} diff --git a/extensions/csv/src/main/java/io/deephaven/csv/parsers/BooleanAsByteParser.java b/extensions/csv/src/main/java/io/deephaven/csv/parsers/BooleanAsByteParser.java deleted file mode 100644 index 2aa35b5376f..00000000000 --- a/extensions/csv/src/main/java/io/deephaven/csv/parsers/BooleanAsByteParser.java +++ /dev/null @@ -1,60 +0,0 @@ -package io.deephaven.csv.parsers; - -import io.deephaven.csv.sinks.Sink; -import io.deephaven.csv.tokenization.Tokenizer; -import io.deephaven.csv.util.CsvReaderException; -import org.apache.commons.lang3.mutable.MutableBoolean; -import org.jetbrains.annotations.NotNull; - -/** - * The parser for the boolean (as byte) type. 
- */ -public final class BooleanAsByteParser implements Parser { - public static BooleanAsByteParser INSTANCE = new BooleanAsByteParser(); - - private BooleanAsByteParser() {} - - @NotNull - @Override - public ParserContext makeParserContext(final GlobalContext gctx, final int chunkSize) { - final Sink sink = gctx.sinkFactory.forBooleanAsByte(); - return new ParserContext<>(sink, null, new byte[chunkSize]); - } - - @Override - public long tryParse(final GlobalContext gctx, final ParserContext pctx, IteratorHolder ih, - final long begin, final long end, final boolean appending) throws CsvReaderException { - final MutableBoolean booleanHolder = new MutableBoolean(); - final Tokenizer t = gctx.tokenizer; - final boolean[] nulls = gctx.nullChunk(); - - final Sink sink = pctx.sink(); - final byte[] values = pctx.valueChunk(); - - long current = begin; - int chunkIndex = 0; - do { - if (chunkIndex == values.length) { - sink.write(values, nulls, current, current + chunkIndex, appending); - current += chunkIndex; - chunkIndex = 0; - } - if (current + chunkIndex == end) { - break; - } - if (gctx.isNullCell(ih)) { - nulls[chunkIndex++] = true; - continue; - } - if (!t.tryParseBoolean(ih.bs(), booleanHolder)) { - break; - } - gctx.isNullOrWidthOneSoFar = false; - values[chunkIndex] = booleanHolder.booleanValue() ? (byte) 1 : (byte) 0; - nulls[chunkIndex] = false; - ++chunkIndex; - } while (ih.tryMoveNext()); - sink.write(values, nulls, current, current + chunkIndex, appending); - return current + chunkIndex; - } -} diff --git a/extensions/csv/src/main/java/io/deephaven/csv/parsers/ByteParser.java b/extensions/csv/src/main/java/io/deephaven/csv/parsers/ByteParser.java deleted file mode 100644 index 1e786a4d5f9..00000000000 --- a/extensions/csv/src/main/java/io/deephaven/csv/parsers/ByteParser.java +++ /dev/null @@ -1,75 +0,0 @@ -package io.deephaven.csv.parsers; - -import io.deephaven.csv.sinks.Sink; -import io.deephaven.csv.sinks.Source; -import io.deephaven.csv.tokenization.RangeTests; -import io.deephaven.csv.tokenization.Tokenizer; -import io.deephaven.csv.util.CsvReaderException; -import org.apache.commons.lang3.mutable.MutableLong; -import org.apache.commons.lang3.mutable.MutableObject; -import org.jetbrains.annotations.NotNull; - -/** - * The parser for the byte type. 
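- * <p>
- * A hypothetical illustration of the reserved-value check in {@code tryParse}: if
- * {@code SinkFactory.reservedByte()} returns {@code Byte.MIN_VALUE} (a plausible null sentinel), then a literal
- * -128 in the input cannot be stored as a byte, so the parser stops there and inference falls back to a wider
- * numeric parser.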
- */ -public final class ByteParser implements Parser { - public static final ByteParser INSTANCE = new ByteParser(); - - private ByteParser() {} - - @NotNull - @Override - public ParserContext makeParserContext(final GlobalContext gctx, final int chunkSize) { - final MutableObject> sourceHolder = new MutableObject<>(); - final Sink sink = gctx.sinkFactory.forByte(sourceHolder); - return new ParserContext<>(sink, sourceHolder.getValue(), new byte[chunkSize]); - } - - @Override - public long tryParse(final GlobalContext gctx, final ParserContext pctx, IteratorHolder ih, - final long begin, final long end, final boolean appending) throws CsvReaderException { - final MutableLong longHolder = new MutableLong(); - final Tokenizer t = gctx.tokenizer; - final boolean[] nulls = gctx.nullChunk(); - - final Sink sink = pctx.sink(); - final Byte reservedValue = gctx.sinkFactory.reservedByte(); - final byte[] values = pctx.valueChunk(); - - long current = begin; - int chunkIndex = 0; - do { - if (chunkIndex == values.length) { - sink.write(values, nulls, current, current + chunkIndex, appending); - current += chunkIndex; - chunkIndex = 0; - } - if (current + chunkIndex == end) { - break; - } - if (gctx.isNullCell(ih)) { - nulls[chunkIndex++] = true; - continue; - } - if (!t.tryParseLong(ih.bs(), longHolder)) { - break; - } - final long value = longHolder.longValue(); - if (!RangeTests.isInRangeForByte(value)) { - break; - } - if (reservedValue != null && value == reservedValue) { - // If a reserved value is defined, it must not be present in the input. - break; - } - if (ih.bs().size() != 1) { - gctx.isNullOrWidthOneSoFar = false; - } - values[chunkIndex] = (byte) value; - nulls[chunkIndex] = false; - ++chunkIndex; - } while (ih.tryMoveNext()); - sink.write(values, nulls, current, current + chunkIndex, appending); - return current + chunkIndex; - } -} diff --git a/extensions/csv/src/main/java/io/deephaven/csv/parsers/CharParser.java b/extensions/csv/src/main/java/io/deephaven/csv/parsers/CharParser.java deleted file mode 100644 index e0559f552cc..00000000000 --- a/extensions/csv/src/main/java/io/deephaven/csv/parsers/CharParser.java +++ /dev/null @@ -1,70 +0,0 @@ -package io.deephaven.csv.parsers; - -import io.deephaven.csv.sinks.Sink; -import io.deephaven.csv.tokenization.Tokenizer; -import io.deephaven.csv.util.CsvReaderException; -import org.apache.commons.lang3.mutable.MutableInt; -import org.jetbrains.annotations.NotNull; - -/** - * The parser for the char type. 
- */ -public final class CharParser implements Parser { - public static final CharParser INSTANCE = new CharParser(); - - private CharParser() {} - - @NotNull - @Override - public ParserContext makeParserContext(final GlobalContext gctx, final int chunkSize) { - final Sink sink = gctx.sinkFactory.forChar(); - return new ParserContext<>(sink, null, new char[chunkSize]); - } - - @Override - public long tryParse(final GlobalContext gctx, final ParserContext pctx, IteratorHolder ih, - final long begin, final long end, final boolean appending) throws CsvReaderException { - final MutableInt intHolder = new MutableInt(); - final Tokenizer t = gctx.tokenizer; - final boolean[] nulls = gctx.nullChunk(); - - final Sink sink = pctx.sink(); - final Character reservedValue = gctx.sinkFactory.reservedChar(); - final char[] values = pctx.valueChunk(); - - if (!gctx.isNullOrWidthOneSoFar) { - return begin; - } - - long current = begin; - int chunkIndex = 0; - do { - if (chunkIndex == values.length) { - sink.write(values, nulls, current, current + chunkIndex, appending); - current += chunkIndex; - chunkIndex = 0; - } - if (current + chunkIndex == end) { - break; - } - if (gctx.isNullCell(ih)) { - nulls[chunkIndex++] = true; - continue; - } - if (!t.tryParseBMPChar(ih.bs(), intHolder)) { - gctx.isNullOrWidthOneSoFar = false; - break; - } - final char value = (char) intHolder.intValue(); - if (reservedValue != null && value == reservedValue) { - // If a sentinel null value is defined, it cannot be present in the input. - break; - } - values[chunkIndex] = value; - nulls[chunkIndex] = false; - ++chunkIndex; - } while (ih.tryMoveNext()); - sink.write(values, nulls, current, current + chunkIndex, appending); - return current + chunkIndex; - } -} diff --git a/extensions/csv/src/main/java/io/deephaven/csv/parsers/DateTimeAsLongParser.java b/extensions/csv/src/main/java/io/deephaven/csv/parsers/DateTimeAsLongParser.java deleted file mode 100644 index 38b6cfc291a..00000000000 --- a/extensions/csv/src/main/java/io/deephaven/csv/parsers/DateTimeAsLongParser.java +++ /dev/null @@ -1,68 +0,0 @@ -package io.deephaven.csv.parsers; - -import io.deephaven.csv.sinks.Sink; -import io.deephaven.csv.tokenization.Tokenizer; -import io.deephaven.csv.util.CsvReaderException; -import org.apache.commons.lang3.mutable.MutableLong; -import org.jetbrains.annotations.NotNull; - -/** - * The parser for the Deephaven DateTime (represented as long) type. 
- */ -public final class DateTimeAsLongParser implements Parser { - public static final DateTimeAsLongParser INSTANCE = new DateTimeAsLongParser(); - - private DateTimeAsLongParser() {} - - @NotNull - @Override - public ParserContext makeParserContext(final GlobalContext gctx, final int chunkSize) { - final Sink sink = gctx.sinkFactory.forDateTimeAsLong(); - return new ParserContext<>(sink, null, new long[chunkSize]); - } - - @Override - public long tryParse(final GlobalContext gctx, final ParserContext pctx, IteratorHolder ih, - final long begin, final long end, final boolean appending) throws CsvReaderException { - final MutableLong dateTimeAsLongHolder = new MutableLong(); - final Tokenizer t = gctx.tokenizer; - final boolean[] nulls = gctx.nullChunk(); - - final Sink sink = pctx.sink(); - final Long reservedValue = gctx.sinkFactory.reservedLong(); - final long[] values = pctx.valueChunk(); - - long current = begin; - int chunkIndex = 0; - do { - if (chunkIndex == values.length) { - sink.write(values, nulls, current, current + chunkIndex, appending); - current += chunkIndex; - chunkIndex = 0; - } - if (current + chunkIndex == end) { - break; - } - if (gctx.isNullCell(ih)) { - nulls[chunkIndex++] = true; - continue; - } - if (!t.tryParseDateTime(ih.bs(), dateTimeAsLongHolder)) { - break; - } - final long value = dateTimeAsLongHolder.longValue(); - if (reservedValue != null && value == reservedValue) { - // If a reserved value is defined, it must not be present in the input. - break; - } - if (ih.bs().size() > 1) { - gctx.isNullOrWidthOneSoFar = false; - } - values[chunkIndex] = value; - nulls[chunkIndex] = false; - ++chunkIndex; - } while (ih.tryMoveNext()); - sink.write(values, nulls, current, current + chunkIndex, appending); - return current + chunkIndex; - } -} diff --git a/extensions/csv/src/main/java/io/deephaven/csv/parsers/DoubleParser.java b/extensions/csv/src/main/java/io/deephaven/csv/parsers/DoubleParser.java deleted file mode 100644 index b6bf89f8094..00000000000 --- a/extensions/csv/src/main/java/io/deephaven/csv/parsers/DoubleParser.java +++ /dev/null @@ -1,68 +0,0 @@ -package io.deephaven.csv.parsers; - -import io.deephaven.csv.sinks.Sink; -import io.deephaven.csv.tokenization.Tokenizer; -import io.deephaven.csv.util.CsvReaderException; -import org.apache.commons.lang3.mutable.MutableDouble; -import org.jetbrains.annotations.NotNull; - -/** - * The parser for the double type. 
- */ -public final class DoubleParser implements Parser { - public static final DoubleParser INSTANCE = new DoubleParser(); - - private DoubleParser() {} - - @NotNull - @Override - public ParserContext makeParserContext(final GlobalContext gctx, final int chunkSize) { - final Sink sink = gctx.sinkFactory.forDouble(); - return new ParserContext<>(sink, null, new double[chunkSize]); - } - - @Override - public long tryParse(final GlobalContext gctx, final ParserContext pctx, IteratorHolder ih, - final long begin, final long end, final boolean appending) throws CsvReaderException { - final MutableDouble doubleHolder = new MutableDouble(); - final Tokenizer t = gctx.tokenizer; - final boolean[] nulls = gctx.nullChunk(); - - final Sink sink = pctx.sink(); - final Double reservedValue = gctx.sinkFactory.reservedDouble(); - final double[] values = pctx.valueChunk(); - - long current = begin; - int chunkIndex = 0; - do { - if (chunkIndex == values.length) { - sink.write(values, nulls, current, current + chunkIndex, appending); - current += chunkIndex; - chunkIndex = 0; - } - if (current + chunkIndex == end) { - break; - } - if (gctx.isNullCell(ih)) { - nulls[chunkIndex++] = true; - continue; - } - if (!t.tryParseDouble(ih.bs(), doubleHolder)) { - break; - } - final double value = doubleHolder.doubleValue(); - if (reservedValue != null && value == reservedValue) { - // If a reserved value is defined, it must not be present in the input. - break; - } - if (ih.bs().size() > 1) { - gctx.isNullOrWidthOneSoFar = false; - } - values[chunkIndex] = value; - nulls[chunkIndex] = false; - ++chunkIndex; - } while (ih.tryMoveNext()); - sink.write(values, nulls, current, current + chunkIndex, appending); - return current + chunkIndex; - } -} diff --git a/extensions/csv/src/main/java/io/deephaven/csv/parsers/FloatFastParser.java b/extensions/csv/src/main/java/io/deephaven/csv/parsers/FloatFastParser.java deleted file mode 100644 index d49d8270993..00000000000 --- a/extensions/csv/src/main/java/io/deephaven/csv/parsers/FloatFastParser.java +++ /dev/null @@ -1,76 +0,0 @@ -package io.deephaven.csv.parsers; - -import io.deephaven.csv.sinks.Sink; -import io.deephaven.csv.tokenization.Tokenizer; -import io.deephaven.csv.tokenization.RangeTests; -import io.deephaven.csv.util.CsvReaderException; -import org.apache.commons.lang3.mutable.MutableDouble; -import org.jetbrains.annotations.NotNull; - -/** - * The parser for the float type. Uses the FastDoubleParser library. Callers who want the exact semantics of - * {@link Float#parseFloat} should use the {@link FloatStrictParser} instead. Most callers won't care, but the reason we - * provide two parsers is that there are some inputs for which {@code Float.parseFloat(input)} differs slightly from - * {@code (float)Double.parseDouble(input)}. Callers that want exactly the answer that {@link Float#parseFloat} provides - * should use {@link FloatStrictParser}. 
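- * <p>
- * A quick way to check whether a particular input is one of those cases (an illustrative snippet, not part of this
- * class; the chosen literal is arbitrary and exception handling is elided):
- * <pre>{@code
- * String s = "0.1"; // most inputs round-trip identically under both routes
- * float viaFloat = Float.parseFloat(s);            // the FloatStrictParser semantics
- * float viaDouble = (float) Double.parseDouble(s); // parse as double, then narrow
- * boolean differs = Float.compare(viaFloat, viaDouble) != 0;
- * }</pre>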
- */ -public final class FloatFastParser implements Parser { - public static final FloatFastParser INSTANCE = new FloatFastParser(); - - private FloatFastParser() {} - - @NotNull - @Override - public ParserContext makeParserContext(final GlobalContext gctx, final int chunkSize) { - final Sink sink = gctx.sinkFactory.forFloat(); - return new ParserContext<>(sink, null, new float[chunkSize]); - } - - @Override - public long tryParse(final GlobalContext gctx, final ParserContext pctx, IteratorHolder ih, - final long begin, final long end, final boolean appending) throws CsvReaderException { - final MutableDouble doubleHolder = new MutableDouble(); - final Tokenizer t = gctx.tokenizer; - final boolean[] nulls = gctx.nullChunk(); - - final Sink sink = pctx.sink(); - final Float reservedValue = gctx.sinkFactory.reservedFloat(); - final float[] values = pctx.valueChunk(); - - long current = begin; - int chunkIndex = 0; - do { - if (chunkIndex == values.length) { - sink.write(values, nulls, current, current + chunkIndex, appending); - current += chunkIndex; - chunkIndex = 0; - } - if (current + chunkIndex == end) { - break; - } - if (gctx.isNullCell(ih)) { - nulls[chunkIndex++] = true; - continue; - } - if (!t.tryParseDouble(ih.bs(), doubleHolder)) { - break; - } - final double value = doubleHolder.doubleValue(); - if (!RangeTests.isInRangeForFloat(value)) { - break; - } - if (reservedValue != null && value == reservedValue) { - // If a reserved value is defined, it must not be present in the input. - break; - } - if (ih.bs().size() > 1) { - gctx.isNullOrWidthOneSoFar = false; - } - values[chunkIndex] = (float) value; - nulls[chunkIndex] = false; - ++chunkIndex; - } while (ih.tryMoveNext()); - sink.write(values, nulls, current, current + chunkIndex, appending); - return current + chunkIndex; - } -} diff --git a/extensions/csv/src/main/java/io/deephaven/csv/parsers/FloatStrictParser.java b/extensions/csv/src/main/java/io/deephaven/csv/parsers/FloatStrictParser.java deleted file mode 100644 index e7e503896d6..00000000000 --- a/extensions/csv/src/main/java/io/deephaven/csv/parsers/FloatStrictParser.java +++ /dev/null @@ -1,71 +0,0 @@ -package io.deephaven.csv.parsers; - -import io.deephaven.csv.sinks.Sink; -import io.deephaven.csv.tokenization.Tokenizer; -import io.deephaven.csv.util.CsvReaderException; -import org.apache.commons.lang3.mutable.MutableFloat; -import org.jetbrains.annotations.NotNull; - -/** - * The parser for the float type. Uses the builtin Java {@link Float#parseFloat} method. Callers who want faster parsing - * and don't need the exact semantics of {@link Float#parseFloat} should use the {@link FloatFastParser} instead. Most - * callers won't care, but the reason we provide two parsers is that there are some inputs for which - * {@code Float.parseFloat(input)} differs slightly from {@code (float)Double.parseDouble(input)}. 
- */ -public final class FloatStrictParser implements Parser { - public static final FloatStrictParser INSTANCE = new FloatStrictParser(); - - private FloatStrictParser() {} - - @NotNull - @Override - public ParserContext makeParserContext(final GlobalContext gctx, final int chunkSize) { - final Sink sink = gctx.sinkFactory.forFloat(); - return new ParserContext<>(sink, null, new float[chunkSize]); - } - - @Override - public long tryParse(final GlobalContext gctx, final ParserContext pctx, IteratorHolder ih, - final long begin, final long end, final boolean appending) throws CsvReaderException { - final MutableFloat floatHolder = new MutableFloat(); - final Tokenizer t = gctx.tokenizer; - final boolean[] nulls = gctx.nullChunk(); - - final Sink sink = pctx.sink(); - final Float reservedValue = gctx.sinkFactory.reservedFloat(); - final float[] values = pctx.valueChunk(); - - long current = begin; - int chunkIndex = 0; - do { - if (chunkIndex == values.length) { - sink.write(values, nulls, current, current + chunkIndex, appending); - current += chunkIndex; - chunkIndex = 0; - } - if (current + chunkIndex == end) { - break; - } - if (gctx.isNullCell(ih)) { - nulls[chunkIndex++] = true; - continue; - } - if (!t.tryParseFloatStrict(ih.bs(), floatHolder)) { - break; - } - final float value = floatHolder.floatValue(); - if (reservedValue != null && value == reservedValue) { - // If a reserved value is defined, it must not be present in the input. - break; - } - if (ih.bs().size() > 1) { - gctx.isNullOrWidthOneSoFar = false; - } - values[chunkIndex] = value; - nulls[chunkIndex] = false; - ++chunkIndex; - } while (ih.tryMoveNext()); - sink.write(values, nulls, current, current + chunkIndex, appending); - return current + chunkIndex; - } -} diff --git a/extensions/csv/src/main/java/io/deephaven/csv/parsers/IntParser.java b/extensions/csv/src/main/java/io/deephaven/csv/parsers/IntParser.java deleted file mode 100644 index 7db83b92175..00000000000 --- a/extensions/csv/src/main/java/io/deephaven/csv/parsers/IntParser.java +++ /dev/null @@ -1,77 +0,0 @@ -package io.deephaven.csv.parsers; - -import io.deephaven.csv.sinks.Sink; -import io.deephaven.csv.sinks.SinkFactory; -import io.deephaven.csv.sinks.Source; -import io.deephaven.csv.tokenization.RangeTests; -import io.deephaven.csv.tokenization.Tokenizer; -import io.deephaven.csv.util.CsvReaderException; -import org.apache.commons.lang3.mutable.MutableLong; -import org.apache.commons.lang3.mutable.MutableObject; -import org.jetbrains.annotations.NotNull; - -/** - * The parser for the int type. 
- */ -public final class IntParser implements Parser { - public static final IntParser INSTANCE = new IntParser(); - - private IntParser() {} - - @NotNull - @Override - public ParserContext makeParserContext(final GlobalContext gctx, final int chunkSize) { - final MutableObject> sourceHolder = new MutableObject<>(); - final Sink sink = gctx.sinkFactory.forInt(sourceHolder); - return new ParserContext<>(sink, sourceHolder.getValue(), new int[chunkSize]); - } - - @Override - public long tryParse(final GlobalContext gctx, final ParserContext pctx, IteratorHolder ih, - final long begin, final long end, final boolean appending) throws CsvReaderException { - final MutableLong longHolder = new MutableLong(); - final Tokenizer t = gctx.tokenizer; - final boolean[] nulls = gctx.nullChunk(); - - final Sink sink = pctx.sink(); - final Integer reservedValue = gctx.sinkFactory.reservedInt(); - final int[] values = pctx.valueChunk(); - - long current = begin; - int chunkIndex = 0; - do { - if (chunkIndex == values.length) { - sink.write(values, nulls, current, current + chunkIndex, appending); - current += chunkIndex; - chunkIndex = 0; - } - if (current + chunkIndex == end) { - break; - } - if (gctx.isNullCell(ih)) { - nulls[chunkIndex++] = true; - continue; - } - if (!t.tryParseLong(ih.bs(), longHolder)) { - break; - } - final long value = longHolder.longValue(); - if (!RangeTests.isInRangeForInt(value)) { - break; - } - if (reservedValue != null && value == reservedValue) { - // If a reserved value is defined, it must not be present in the input. - break; - } - if (ih.bs().size() > 1) { - // Not an error, but needed in case we eventually fall back to char. - gctx.isNullOrWidthOneSoFar = false; - } - values[chunkIndex] = (int) value; - nulls[chunkIndex] = false; - ++chunkIndex; - } while (ih.tryMoveNext()); - sink.write(values, nulls, current, current + chunkIndex, appending); - return current + chunkIndex; - } -} diff --git a/extensions/csv/src/main/java/io/deephaven/csv/parsers/IteratorHolder.java b/extensions/csv/src/main/java/io/deephaven/csv/parsers/IteratorHolder.java deleted file mode 100644 index c10babb676c..00000000000 --- a/extensions/csv/src/main/java/io/deephaven/csv/parsers/IteratorHolder.java +++ /dev/null @@ -1,78 +0,0 @@ -package io.deephaven.csv.parsers; - -import io.deephaven.csv.densestorage.DenseStorageReader; -import io.deephaven.csv.containers.ByteSlice; -import io.deephaven.csv.tokenization.RangeTests; -import io.deephaven.csv.util.CsvReaderException; - -/** - * This class is used to hold the underlying {@link DenseStorageReader} plus some associated helper information (an - * allocated {@link ByteSlice} for slice storage, plus a couple helpful statistics like {@link #numConsumed} and - * {@link #isExhausted}. - */ -public final class IteratorHolder { - /** - * The {@link DenseStorageReader} for the input text. - */ - private final DenseStorageReader dsr; - /** - * Storage for our reusable byte slice. Data inside it is valid after a call to tryMoveNext() returns true, in the - * case where hasBytes has been set to true. - */ - private final ByteSlice bs = new ByteSlice(); - /** - * Number of successful calls so far to tryMoveNext (i.e. those that returned true). - */ - private long numConsumed = 0; - /** - * Valid anytime after the first call to tryMoveNext(), but not before. - */ - private boolean isExhausted = false; - - /** - * Constructor. - */ - public IteratorHolder(DenseStorageReader dsr) { - this.dsr = dsr; - } - - /** - * Try to advance to the next (or very first) item. 
- * - * @return true if we were able to advance, and set {@link IteratorHolder#bs} to valid text. Otherwise false. - */ - public boolean tryMoveNext() throws CsvReaderException { - isExhausted = !dsr.tryGetNextSlice(bs); - if (isExhausted) { - return false; - } - ++numConsumed; - return true; - } - - /** - * Getter for the byte slice. - */ - public ByteSlice bs() { - return bs; - } - - /** - * Number of items we've consumed so far. This is the number of times {@link #tryMoveNext} has been called and - * returned true. - * - * @return The number of items we've consumed so far - */ - public long numConsumed() { - return numConsumed; - } - - /** - * Is the iteration exhausted? This is set to true when {@link #tryMoveNext} is called and returns false. - * - * @return Whether the iteration is exhausted. - */ - public boolean isExhausted() { - return isExhausted; - } -} diff --git a/extensions/csv/src/main/java/io/deephaven/csv/parsers/LongParser.java b/extensions/csv/src/main/java/io/deephaven/csv/parsers/LongParser.java deleted file mode 100644 index b33e4a6cf40..00000000000 --- a/extensions/csv/src/main/java/io/deephaven/csv/parsers/LongParser.java +++ /dev/null @@ -1,71 +0,0 @@ -package io.deephaven.csv.parsers; - -import io.deephaven.csv.sinks.Sink; -import io.deephaven.csv.sinks.Source; -import io.deephaven.csv.tokenization.Tokenizer; -import io.deephaven.csv.util.CsvReaderException; -import org.apache.commons.lang3.mutable.MutableLong; -import org.apache.commons.lang3.mutable.MutableObject; -import org.jetbrains.annotations.NotNull; - -/** - * The parser for the long type. - */ -public final class LongParser implements Parser { - public static final LongParser INSTANCE = new LongParser(); - - private LongParser() {} - - @NotNull - @Override - public ParserContext makeParserContext(final GlobalContext gctx, final int chunkSize) { - final MutableObject> sourceHolder = new MutableObject<>(); - final Sink sink = gctx.sinkFactory.forLong(sourceHolder); - return new ParserContext<>(sink, sourceHolder.getValue(), new long[chunkSize]); - } - - @Override - public long tryParse(final GlobalContext gctx, final ParserContext pctx, IteratorHolder ih, - final long begin, final long end, final boolean appending) throws CsvReaderException { - final MutableLong longHolder = new MutableLong(); - final Tokenizer t = gctx.tokenizer; - final boolean[] nulls = gctx.nullChunk(); - - final Sink sink = pctx.sink(); - final Long reservedValue = gctx.sinkFactory.reservedLong(); - final long[] values = pctx.valueChunk(); - - long current = begin; - int chunkIndex = 0; - do { - if (chunkIndex == values.length) { - sink.write(values, nulls, current, current + chunkIndex, appending); - current += chunkIndex; - chunkIndex = 0; - } - if (current + chunkIndex == end) { - break; - } - if (gctx.isNullCell(ih)) { - nulls[chunkIndex++] = true; - continue; - } - if (!t.tryParseLong(ih.bs(), longHolder)) { - break; - } - final long value = longHolder.longValue(); - if (reservedValue != null && value == reservedValue) { - // If a reserved value is defined, it must not be present in the input. 
- break; - } - if (ih.bs().size() > 1) { - gctx.isNullOrWidthOneSoFar = false; - } - values[chunkIndex] = value; - nulls[chunkIndex] = false; - ++chunkIndex; - } while (ih.tryMoveNext()); - sink.write(values, nulls, current, current + chunkIndex, appending); - return current + chunkIndex; - } -} diff --git a/extensions/csv/src/main/java/io/deephaven/csv/parsers/Parser.java b/extensions/csv/src/main/java/io/deephaven/csv/parsers/Parser.java deleted file mode 100644 index deeb0f646fe..00000000000 --- a/extensions/csv/src/main/java/io/deephaven/csv/parsers/Parser.java +++ /dev/null @@ -1,146 +0,0 @@ -package io.deephaven.csv.parsers; - -import io.deephaven.csv.sinks.Sink; -import io.deephaven.csv.sinks.SinkFactory; -import io.deephaven.csv.sinks.Source; -import io.deephaven.csv.tokenization.Tokenizer; -import io.deephaven.csv.util.CsvReaderException; -import org.jetbrains.annotations.NotNull; - -import java.nio.charset.StandardCharsets; -import java.util.Arrays; - -/** - * The Parser interface to the CsvReader. This is implemented by all the built-in parsers {@link IntParser}, - * {@link DoubleParser}, etc, as well as user-defined custom parsers. - * - * @param - */ -public interface Parser { - int CHUNK_SIZE = 65536 * 4; - - /** - * Make a context object for the parser. Sample implementation:
-     * <pre>
-     * final MySink sink = new MySink();
-     * return new ParserContext<>(sink, null, new MyType[chunkSize]);
-     * </pre>
    - * - *

    - * Note that parsers other than {Byte,Short,Int,Long}Parser can leave the source field null, as in the above - * example. - * - * @param gctx The GlobalContext. Built-in parsers use this to access the SinkFactory so that they can make a Sink - * of the right type. Custom parsers will probably not need this. - * @param chunkSize The size of the chunk to create. - * @return The ParserContext. - */ - @NotNull - ParserContext makeParserContext(final GlobalContext gctx, final int chunkSize); - - /** - * Tries to parse the data pointed to by IteratorHolder 'ih' into a Sink. The method parses as many values as it - * can. It stops when: - *

      - *
-     * <ol>
-     * <li>The range [{@code begin}, {@code end}) is full, or</li>
-     * <li>The iterator {@code ih} is exhausted, or</li>
-     * <li>The code encounters a source value that it is unable to parse.</li>
-     * </ol>
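-     * <p>
-     * For example (hypothetical driver logic, not part of this interface): if a call such as
-     * {@code tryParse(gctx, pctx, ih, 0, Long.MAX_VALUE, true)} returns 100 while {@code ih} is not yet exhausted,
-     * the caller knows that values [0, 100) were parsed successfully and that the cell {@code ih} currently points
-     * at was rejected, so it can fall back to a more permissive parser, replaying the column through a fresh
-     * {@link io.deephaven.csv.densestorage.DenseStorageReader}.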
    - * - * @param gctx The {@link GlobalContext} holding various shared parameters for the parse. This will be shared among - * parsers of different types as the type inference process proceeds. - * @param pctx The {@link ParserContext} for this specific parser. It will be the object created by the call to - * {Parser#makeContext}. If the caller calls {@link Parser#tryParse} multiple times (for example during - * two-phase parsing), it will pass the same {@link ParserContext} object each time. - * @param ih An IteratorHolder pointing to the data. It is already pointing to the current element or the end (in - * other words, it has had {@link IteratorHolder#tryMoveNext} called on it at least once). The reason for - * this invariant is because other code (controlling logic and other parsers) have needed to peek at the - * current element before getting here in order to decide what to do. - * @param begin The start of the range (inclusive) to write values to. - * @param end The end of the range (exclusive) to write values to. This can also be a very large value like - * Long.MAX_VALUE if the caller does not know how many values there are. - * @param appending Whether the parser is being called in a mode where it is appending to the end of the - * {@link Sink} or replacing previously-written pad values in the {@link Sink}. This value is simply passed - * on to {@link Sink#write} which may use it as a hint to slightly simplify its logic. - * @return The end range (exclusive) of the values parsed. Returns {@code begin} if no values were parsed. - */ - long tryParse(GlobalContext gctx, ParserContext pctx, IteratorHolder ih, - long begin, long end, boolean appending) throws CsvReaderException; - - class GlobalContext { - /** - * The Tokenizer is responsible for parsing entities like ints, doubles, supported DateTime formats, etc. - */ - public final Tokenizer tokenizer; - /** - * Caller-specified interface for making all the various Sink<TARRAY> types. - */ - public final SinkFactory sinkFactory; - /** - * Whether all the cells seen so far are the "null" indicator (usually the empty string), or are 1 character in - * length. This is used when inferring char vs String. - */ - public boolean isNullOrWidthOneSoFar; - /** - * If the null sentinel is not the empty string, then this field contains the UTF-8 encoded bytes of the null - * sentinel string. Otherwise this field contains null. - */ - private final byte[] nullSentinelBytes; - /** - * An "isNull" chunk - */ - private final boolean[] nullChunk; - - public GlobalContext(final Tokenizer tokenizer, final SinkFactory sinkFactory, final String nullValueLiteral) { - this.tokenizer = tokenizer; - this.sinkFactory = sinkFactory; - isNullOrWidthOneSoFar = true; - - // Process the nullValueLiteral into a byte array so the isNullCell test can run quickly. - nullSentinelBytes = nullValueLiteral.getBytes(StandardCharsets.UTF_8); - nullChunk = new boolean[CHUNK_SIZE]; - } - - /** - * Determines whether the iterator's current text contains the null value literal. The notion of "null value - * literal" is user-configurable on a per-column basis, but is typically the empty string. - * - * @return whether the iterator's current text contains the null cell. - */ - public boolean isNullCell(final IteratorHolder ih) { - // A possibly-needless optimization. 
- if (nullSentinelBytes.length == 0) { - return ih.bs().size() == 0; - } - return Arrays.equals(ih.bs().data(), ih.bs().begin(), ih.bs().end(), - nullSentinelBytes, 0, nullSentinelBytes.length); - } - - public boolean[] nullChunk() { - return nullChunk; - } - } - - class ParserContext { - private final Sink sink; - private final Source source; - private final TARRAY valueChunk; - - public ParserContext(Sink sink, Source source, TARRAY valueChunk) { - this.sink = sink; - this.source = source; - this.valueChunk = valueChunk; - } - - public Sink sink() { - return sink; - } - - public Source source() { - return source; - } - - public TARRAY valueChunk() { - return valueChunk; - } - } -} diff --git a/extensions/csv/src/main/java/io/deephaven/csv/parsers/Parsers.java b/extensions/csv/src/main/java/io/deephaven/csv/parsers/Parsers.java deleted file mode 100644 index 9147775435e..00000000000 --- a/extensions/csv/src/main/java/io/deephaven/csv/parsers/Parsers.java +++ /dev/null @@ -1,111 +0,0 @@ -package io.deephaven.csv.parsers; - -import java.util.ArrayList; -import java.util.List; - -/** - * Standard system parsers for the {@link io.deephaven.csv.reading.CsvReader}. - */ -public class Parsers { - public static final Parser BOOLEAN = BooleanAsByteParser.INSTANCE; - public static final Parser BYTE = ByteParser.INSTANCE; - public static final Parser SHORT = ShortParser.INSTANCE; - public static final Parser INT = IntParser.INSTANCE; - public static final Parser LONG = LongParser.INSTANCE; - public static final Parser FLOAT_FAST = FloatFastParser.INSTANCE; - public static final Parser FLOAT_STRICT = FloatStrictParser.INSTANCE; - public static final Parser DOUBLE = DoubleParser.INSTANCE; - public static final Parser DATETIME = DateTimeAsLongParser.INSTANCE; - public static final Parser CHAR = CharParser.INSTANCE; - public static final Parser STRING = StringParser.INSTANCE; - public static final Parser TIMESTAMP_SECONDS = TimestampSecondsParser.INSTANCE; - public static final Parser TIMESTAMP_MILLIS = TimestampMillisParser.INSTANCE; - public static final Parser TIMESTAMP_MICROS = TimestampMicrosParser.INSTANCE; - public static final Parser TIMESTAMP_NANOS = TimestampNanosParser.INSTANCE; - - /** - * Notably, BYTE, SHORT, and FLOAT are not in the list of standard parsers. The TIMESTAMP_* parsers are never - * included by default, because they look like ints/longs. - */ - public static final List> DEFAULT = List.of( - BOOLEAN, - INT, - LONG, - DOUBLE, - DATETIME, - CHAR, - STRING); - - /** - * The above plus BYTE. The TIMESTAMP_* parsers are never included by default, because they look like ints/longs. - */ - public static final List> COMPLETE = List.of( - BOOLEAN, - BYTE, - SHORT, - INT, - LONG, - DOUBLE, - DATETIME, - CHAR, - STRING); - - /** - * Like COMPLETE but with FLOAT_FAST rather than DOUBLE. - */ - public static final List> COMPLETE_FLOAT = List.of( - BOOLEAN, - BYTE, - SHORT, - INT, - LONG, - FLOAT_FAST, - DATETIME, - CHAR, - STRING); - - /** - * Minimal - */ - public static final List> MINIMAL = List.of( - BOOLEAN, - LONG, - DOUBLE, - DATETIME, - STRING); - - /** - * Strings only. - */ - public static final List> STRINGS = List.of(STRING); - - /** - * DateTime, Double, Boolean, Char, String, and timestamp (seconds). - */ - public static final List> STANDARD_TIMES = someOtherParsersAnd(Parsers.TIMESTAMP_SECONDS); - - /** - * DateTime, Double, Boolean, Char, String, and timestamp (milliseconds). 
- */ - public static final List> STANDARD_MILLITIMES = someOtherParsersAnd(Parsers.TIMESTAMP_MILLIS); - - /** - * DateTime, Double, Boolean, Char, String, and timestamp (microseconds). - */ - public static final List> STANDARD_MICROTIMES = someOtherParsersAnd(Parsers.TIMESTAMP_MICROS); - - /** - * DateTime, Double, Boolean, Char, String, and timestamp (nanoseconds). - */ - public static final List> STANDARD_NANOTIMES = someOtherParsersAnd(Parsers.TIMESTAMP_NANOS); - - private static List> someOtherParsersAnd(final Parser oneMore) { - final List> result = new ArrayList<>(); - result.add(BOOLEAN); - result.add(DATETIME); - result.add(CHAR); - result.add(STRING); - result.add(oneMore); - return List.copyOf(result); - } -} diff --git a/extensions/csv/src/main/java/io/deephaven/csv/parsers/ShortParser.java b/extensions/csv/src/main/java/io/deephaven/csv/parsers/ShortParser.java deleted file mode 100644 index b09a4f43f3c..00000000000 --- a/extensions/csv/src/main/java/io/deephaven/csv/parsers/ShortParser.java +++ /dev/null @@ -1,75 +0,0 @@ -package io.deephaven.csv.parsers; - -import io.deephaven.csv.sinks.Sink; -import io.deephaven.csv.sinks.Source; -import io.deephaven.csv.tokenization.RangeTests; -import io.deephaven.csv.tokenization.Tokenizer; -import io.deephaven.csv.util.CsvReaderException; -import org.apache.commons.lang3.mutable.MutableLong; -import org.apache.commons.lang3.mutable.MutableObject; -import org.jetbrains.annotations.NotNull; - -/** - * The parser for the short type. - */ -public final class ShortParser implements Parser { - public static final ShortParser INSTANCE = new ShortParser(); - - private ShortParser() {} - - @NotNull - @Override - public ParserContext makeParserContext(final GlobalContext gctx, final int chunkSize) { - final MutableObject> sourceHolder = new MutableObject<>(); - final Sink sink = gctx.sinkFactory.forShort(sourceHolder); - return new ParserContext<>(sink, sourceHolder.getValue(), new short[chunkSize]); - } - - @Override - public long tryParse(final GlobalContext gctx, final ParserContext pctx, IteratorHolder ih, - final long begin, final long end, final boolean appending) throws CsvReaderException { - final MutableLong longHolder = new MutableLong(); - final Tokenizer t = gctx.tokenizer; - final boolean[] nulls = gctx.nullChunk(); - - final Sink sink = pctx.sink(); - final Short reservedValue = gctx.sinkFactory.reservedShort(); - final short[] values = pctx.valueChunk(); - - long current = begin; - int chunkIndex = 0; - do { - if (chunkIndex == values.length) { - sink.write(values, nulls, current, current + chunkIndex, appending); - current += chunkIndex; - chunkIndex = 0; - } - if (current + chunkIndex == end) { - break; - } - if (gctx.isNullCell(ih)) { - nulls[chunkIndex++] = true; - continue; - } - if (!t.tryParseLong(ih.bs(), longHolder)) { - break; - } - final long value = longHolder.longValue(); - if (!RangeTests.isInRangeForShort(value)) { - break; - } - if (reservedValue != null && value == reservedValue) { - // If a reserved value is defined, it must not be present in the input. 
- break; - } - if (ih.bs().size() > 1) { - gctx.isNullOrWidthOneSoFar = false; - } - values[chunkIndex] = (short) value; - nulls[chunkIndex] = false; - ++chunkIndex; - } while (ih.tryMoveNext()); - sink.write(values, nulls, current, current + chunkIndex, appending); - return current + chunkIndex; - } -} diff --git a/extensions/csv/src/main/java/io/deephaven/csv/parsers/StringParser.java b/extensions/csv/src/main/java/io/deephaven/csv/parsers/StringParser.java deleted file mode 100644 index 52f8b307302..00000000000 --- a/extensions/csv/src/main/java/io/deephaven/csv/parsers/StringParser.java +++ /dev/null @@ -1,58 +0,0 @@ -package io.deephaven.csv.parsers; - -import io.deephaven.csv.sinks.Sink; -import io.deephaven.csv.util.CsvReaderException; -import org.jetbrains.annotations.NotNull; - -/** - * The parser for the String type. - */ -public final class StringParser implements Parser { - public static final StringParser INSTANCE = new StringParser(); - - private StringParser() {} - - @NotNull - @Override - public ParserContext makeParserContext(final GlobalContext gctx, final int chunkSize) { - final Sink sink = gctx.sinkFactory.forString(); - return new ParserContext<>(sink, null, new String[chunkSize]); - } - - @Override - public long tryParse(final GlobalContext gctx, final ParserContext pctx, IteratorHolder ih, - final long begin, final long end, final boolean appending) throws CsvReaderException { - final boolean[] nulls = gctx.nullChunk(); - - final Sink sink = pctx.sink(); - final String reservedValue = gctx.sinkFactory.reservedString(); - final String[] values = pctx.valueChunk(); - - long current = begin; - int chunkIndex = 0; - do { - if (chunkIndex == values.length) { - sink.write(values, nulls, current, current + chunkIndex, appending); - current += chunkIndex; - chunkIndex = 0; - } - if (current + chunkIndex == end) { - break; - } - if (gctx.isNullCell(ih)) { - nulls[chunkIndex++] = true; - continue; - } - final String value = ih.bs().toString(); - if (value.equals(reservedValue)) { - // If a reserved value is defined, it must not be present in the input. - break; - } - values[chunkIndex] = value; - nulls[chunkIndex] = false; - ++chunkIndex; - } while (ih.tryMoveNext()); - sink.write(values, nulls, current, current + chunkIndex, appending); - return current + chunkIndex; - } -} diff --git a/extensions/csv/src/main/java/io/deephaven/csv/parsers/TimestampMicrosParser.java b/extensions/csv/src/main/java/io/deephaven/csv/parsers/TimestampMicrosParser.java deleted file mode 100644 index 212b1a925b3..00000000000 --- a/extensions/csv/src/main/java/io/deephaven/csv/parsers/TimestampMicrosParser.java +++ /dev/null @@ -1,12 +0,0 @@ -package io.deephaven.csv.parsers; - -/** - * The parser for "microseconds since Unix epoch". - */ -public class TimestampMicrosParser extends TimestampParserBase { - public static final TimestampMicrosParser INSTANCE = new TimestampMicrosParser(); - - private TimestampMicrosParser() { - super(MICROSECOND_SCALE); - } -} diff --git a/extensions/csv/src/main/java/io/deephaven/csv/parsers/TimestampMillisParser.java b/extensions/csv/src/main/java/io/deephaven/csv/parsers/TimestampMillisParser.java deleted file mode 100644 index 34357238aef..00000000000 --- a/extensions/csv/src/main/java/io/deephaven/csv/parsers/TimestampMillisParser.java +++ /dev/null @@ -1,12 +0,0 @@ -package io.deephaven.csv.parsers; - -/** - * The parser for "milliseconds since Unix epoch". 
- */ -public class TimestampMillisParser extends TimestampParserBase { - public static final TimestampMillisParser INSTANCE = new TimestampMillisParser(); - - private TimestampMillisParser() { - super(MILLISECOND_SCALE); - } -} diff --git a/extensions/csv/src/main/java/io/deephaven/csv/parsers/TimestampNanosParser.java b/extensions/csv/src/main/java/io/deephaven/csv/parsers/TimestampNanosParser.java deleted file mode 100644 index 616d04c3e39..00000000000 --- a/extensions/csv/src/main/java/io/deephaven/csv/parsers/TimestampNanosParser.java +++ /dev/null @@ -1,12 +0,0 @@ -package io.deephaven.csv.parsers; - -/** - * The parser for "nanoseconds since Unix epoch". - */ -public class TimestampNanosParser extends TimestampParserBase { - public static final TimestampNanosParser INSTANCE = new TimestampNanosParser(); - - private TimestampNanosParser() { - super(NANOSECOND_SCALE); - } -} diff --git a/extensions/csv/src/main/java/io/deephaven/csv/parsers/TimestampParserBase.java b/extensions/csv/src/main/java/io/deephaven/csv/parsers/TimestampParserBase.java deleted file mode 100644 index 49ea37a6a74..00000000000 --- a/extensions/csv/src/main/java/io/deephaven/csv/parsers/TimestampParserBase.java +++ /dev/null @@ -1,89 +0,0 @@ -package io.deephaven.csv.parsers; - -import io.deephaven.csv.sinks.Sink; -import io.deephaven.csv.sinks.SinkFactory; -import io.deephaven.csv.sinks.Source; -import io.deephaven.csv.tokenization.Tokenizer; -import io.deephaven.csv.util.CsvReaderException; -import org.apache.commons.lang3.mutable.MutableLong; -import org.apache.commons.lang3.mutable.MutableObject; -import org.jetbrains.annotations.NotNull; - -/** - * The base class for various timestamp parsers. These parsers parse longs, scale them by some appropriate value, and - * then feed them to the sink for the Deephaven DateTime (as long) type. 
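For a sense of the bounds this scaling implies: with SECOND_SCALE = 1_000_000_000, the largest epoch-seconds value that still fits in a long after scaling is Long.MAX_VALUE / 1_000_000_000, roughly 9.22e9 seconds, or about 292 years on either side of the epoch. A hedged one-liner of the range guard, with illustrative names:

    // Rejects values whose scaled form would overflow a signed 64-bit long.
    static boolean fitsAfterScaling(long value, long scale) {
        return value >= Long.MIN_VALUE / scale && value <= Long.MAX_VALUE / scale;
    }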
- */ -public abstract class TimestampParserBase implements Parser { - protected static final long SECOND_SCALE = 1_000_000_000; - protected static final long MILLISECOND_SCALE = 1_000_000; - protected static final long MICROSECOND_SCALE = 1_000; - protected static final long NANOSECOND_SCALE = 1; - - private final long scale; - private final long minValue; - private final long maxValue; - - /** - * @param scale: 1 for seconds, 1000 for millis, 1_000_000 for micros, 1_000_000_000 for nanos - */ - protected TimestampParserBase(long scale) { - this.scale = scale; - minValue = Long.MIN_VALUE / scale; - maxValue = Long.MAX_VALUE / scale; - } - - @NotNull - @Override - public ParserContext makeParserContext(final Parser.GlobalContext gctx, final int chunkSize) { - final Sink sink = gctx.sinkFactory.forTimestampAsLong(); - return new ParserContext<>(sink, null, new long[chunkSize]); - } - - @Override - public long tryParse(final GlobalContext gctx, final ParserContext pctx, IteratorHolder ih, - final long begin, final long end, final boolean appending) throws CsvReaderException { - final MutableLong longHolder = new MutableLong(); - final Tokenizer t = gctx.tokenizer; - final boolean[] nulls = gctx.nullChunk(); - - final Sink sink = pctx.sink(); - final Long reservedValue = gctx.sinkFactory.reservedLong(); - final long[] values = pctx.valueChunk(); - - long current = begin; - int chunkIndex = 0; - do { - if (chunkIndex == values.length) { - sink.write(values, nulls, current, current + chunkIndex, appending); - current += chunkIndex; - chunkIndex = 0; - } - if (current + chunkIndex == end) { - break; - } - if (gctx.isNullCell(ih)) { - nulls[chunkIndex++] = true; - continue; - } - if (!t.tryParseLong(ih.bs(), longHolder)) { - break; - } - final long value = longHolder.longValue(); - if (value < minValue || value > maxValue) { - break; - } - if (reservedValue != null && value == reservedValue) { - // If a reserved value is defined, it must not be present in the input. - break; - } - if (ih.bs().size() > 1) { - gctx.isNullOrWidthOneSoFar = false; - } - values[chunkIndex] = value * scale; - nulls[chunkIndex] = false; - ++chunkIndex; - } while (ih.tryMoveNext()); - sink.write(values, nulls, current, current + chunkIndex, appending); - return current + chunkIndex; - } -} diff --git a/extensions/csv/src/main/java/io/deephaven/csv/parsers/TimestampSecondsParser.java b/extensions/csv/src/main/java/io/deephaven/csv/parsers/TimestampSecondsParser.java deleted file mode 100644 index 69949aa5caa..00000000000 --- a/extensions/csv/src/main/java/io/deephaven/csv/parsers/TimestampSecondsParser.java +++ /dev/null @@ -1,12 +0,0 @@ -package io.deephaven.csv.parsers; - -/** - * The parser for "seconds since Unix epoch". 
- */ -public final class TimestampSecondsParser extends TimestampParserBase { - public static final TimestampSecondsParser INSTANCE = new TimestampSecondsParser(); - - private TimestampSecondsParser() { - super(SECOND_SCALE); - } -} diff --git a/extensions/csv/src/main/java/io/deephaven/csv/reading/CellGrabber.java b/extensions/csv/src/main/java/io/deephaven/csv/reading/CellGrabber.java deleted file mode 100644 index ef4d6e3d351..00000000000 --- a/extensions/csv/src/main/java/io/deephaven/csv/reading/CellGrabber.java +++ /dev/null @@ -1,347 +0,0 @@ -package io.deephaven.csv.reading; - -import io.deephaven.csv.containers.ByteSlice; -import io.deephaven.csv.containers.GrowableByteBuffer; -import io.deephaven.csv.tokenization.RangeTests; -import io.deephaven.csv.util.CsvReaderException; -import org.apache.commons.lang3.mutable.MutableBoolean; - -import java.io.IOException; -import java.io.InputStream; - -/** - * This class is used to traverse over text from a Reader, understanding both field and line delimiters, as well as the - * CSV quoting convention, and breaking the text into cells for use by the calling code. - */ -final class CellGrabber { - /** - * Size of chunks to read from the {@link InputStream}. - */ - private static final int BUFFER_SIZE = 65536; - /** - * The {@link InputStream} for the input. - */ - private final InputStream inputStream; - /** - * The configured CSV quote character (typically '"'). - */ - private final byte quoteChar; - /** - * The configured CVS field delimiter (typically ','). - */ - private final byte fieldDelimiter; - /** - * Whether to trim leading and trailing blanks from non-quoted values. - */ - private final boolean ignoreSurroundingSpaces; - /** - * Whether to trim leading and trailing blanks from inside quoted values. - */ - private final boolean trim; - /** - * The current chunk we have read from the file. - */ - private final byte[] buffer; - /** - * Size of the last buffer chunk read. - */ - private int size; - /** - * Current offset in the buffer chunk. - */ - private int offset; - /** - * Starting offset of a contiguous span of characters we are scanning from the buffer chunk. - */ - private int startOffset; - /** - * A side buffer we have to use for edge cases. Normally we try to return a {@link ByteSlice} which shares our - * buffer[] array. But we can't do that when the input cell spans more than one buffer[] chunk, or when the input - * cell does not exactly represent the output. This latter case can happen for example when an escaped quote ("") - * needs to be returned as a single quotation mark ("). So if our input is hello""there, then we can't directly - * return a slice of the input array, because actually we need hello"there (one quotation mark, not two). - */ - private final GrowableByteBuffer spillBuffer; - /** - * Zero-based row number of the input stream. This is for informational purposes only and in particular does NOT - * refer to the number of data rows in the input. (This is because the data rows may be split across multiple lines - * and because there may or may not be headers). We track this number for the benefit of the caller, who may want to - * issue an informative error message when there is a problem. - */ - private int physicalRowNum; - - /** - * Constructor. 
- */ - public CellGrabber(final InputStream inputStream, final byte quoteChar, final byte fieldDelimiter, - final boolean ignoreSurroundingSpaces, final boolean trim) { - this.inputStream = inputStream; - this.quoteChar = quoteChar; - this.fieldDelimiter = fieldDelimiter; - this.ignoreSurroundingSpaces = ignoreSurroundingSpaces; - this.trim = trim; - this.buffer = new byte[BUFFER_SIZE]; - this.size = 0; - this.offset = 0; - this.startOffset = 0; - this.spillBuffer = new GrowableByteBuffer(); - this.physicalRowNum = 0; - } - - /** - * Try to grab the next cell from the input, being aware of field delimiters, line delimiters, quoting, and - * trimming. - * - * @param dest The result, as a {@link ByteSlice}. The ByteSlice is invalidated by the next call to grabNext. - * @param lastInRow An out parameter whose contents are only specified if this method returns true. Its contents - * will be set to true if the cell just read was the last cell in the row, otherwise they will be set to - * false. - * @return true if a cell was read; false if at end of input. - */ - public boolean grabNext(final ByteSlice dest, final MutableBoolean lastInRow) throws CsvReaderException { - spillBuffer.clear(); - startOffset = offset; - - if (ignoreSurroundingSpaces) { - skipWhitespace(); - } - if (!tryEnsureMore()) { - return false; - } - - // Is first char the quote char? - if (buffer[offset] == quoteChar) { - ++offset; - processQuotedMode(dest, lastInRow); - if (trim) { - trimWhitespace(dest); - } - } else { - processUnquotedMode(dest, lastInRow); - } - return true; - } - - /** - * Process characters in "quoted mode". This involves some trickery to deal with quoted quotes and the end quote. - * - * @param lastInRow An out parameter. Its contents will be set to true if the cell just read was the last cell in - * the row, otherwise the contents will be set to false. - */ - private void processQuotedMode(final ByteSlice dest, final MutableBoolean lastInRow) throws CsvReaderException { - startOffset = offset; - boolean prevCharWasCarriageReturn = false; - while (true) { - if (offset == size) { - if (!tryEnsureMore()) { - throw new CsvReaderException("Cell did not have closing quote character"); - } - } - final byte ch = buffer[offset++]; - // Maintain a correct row number. This is somehat tricky. - if (ch == '\r') { - ++physicalRowNum; - prevCharWasCarriageReturn = true; - } else { - if (ch == '\n' && !prevCharWasCarriageReturn) { - ++physicalRowNum; - } - prevCharWasCarriageReturn = false; - } - if (ch != quoteChar) { - // Ordinary character. Note: in quoted mode we will gladly eat field and line separators. - continue; - } - // This character is a quote char. It could be the end of the cell, or it could be an escaped - // quote char (e.g. ""). The way to tell is to peek ahead at the next character. - if (!tryEnsureMore()) { - // There is no next char (we are at end of input), so let's call this end of cell. - break; - } - final byte peek = buffer[offset]; - if (peek != quoteChar) { - // There is a next char, but it's not a quotation mark. So this - // quotation mark must be the end of the quoted string. - break; - } - // There is a next character, and it *is* a quotation mark. So this is a quoted quote - // "", to be interpreted as ". So we'll spill this string (up to the first quotation mark), - // skip the second quotation mark, and keep going. - spillRange(); - // Skip the second quotation mark. - ++offset; - startOffset = offset; - } - // We got out of the quoted string. 
Consume any trailing matter after the quote and before the field - // delimiter. Hopefully that trailing matter is just whitespace, but we shall see. - finishField(dest, lastInRow); - - // From this point on, note that dest is a slice that may point to the underlying input buffer or the spill - // buffer. Take care from this point on to not disturb the input (e.g. by reading the next chunk) or the - // spill buffer. - - // The easiest way to make all the above logic run smoothly is to let the final quotation mark - // (which will unconditionally be there) and subsequent whitespace (if any) into the field. - // Then we can simply trim it back out now. - while (dest.begin() != dest.end() && RangeTests.isSpaceOrTab(dest.back())) { - dest.setEnd(dest.end() - 1); - } - if (dest.begin() == dest.end() || dest.back() != quoteChar) { - throw new RuntimeException("Logic error: final non-whitespace in field is not quoteChar"); - } - dest.setEnd(dest.end() - 1); - } - - /** - * Process characters in "unquoted mode". This is easy: eat characters until the next field or line delimiter. - */ - private void processUnquotedMode(final ByteSlice dest, final MutableBoolean lastInRow) throws CsvReaderException { - startOffset = offset; - finishField(dest, lastInRow); - } - - /** - * Skip whitespace but do not consider the field delimiter to be whitespace. - */ - private void skipWhitespace() throws CsvReaderException { - while (true) { - if (offset == size) { - if (!tryEnsureMore()) { - return; - } - } - final byte ch = buffer[offset]; - if (ch == fieldDelimiter || !RangeTests.isSpaceOrTab(ch)) { - return; - } - ++offset; - } - } - - /** - * Eat characters until the next field or line delimiter. - * - * @param lastInRow An out parameter. Its contents are set to true if the cell was the last one in the row. - * Otherwise, its contents are set to false. - */ - private void finishField(final ByteSlice dest, final MutableBoolean lastInRow) throws CsvReaderException { - while (true) { - if (offset == size) { - if (!tryEnsureMore()) { - finish(dest); - // End of file sets last in row. - lastInRow.setValue(true); - return; - } - } - final byte ch = buffer[offset]; - if (ch == fieldDelimiter) { - finish(dest); - ++offset; // ... and skip over the field delimiter. - lastInRow.setValue(false); - return; - } - if (ch == '\n') { - finish(dest); - ++offset; - lastInRow.setValue(true); - ++physicalRowNum; - return; - } - if (ch == '\r') { - finish(dest); - ++offset; - if (tryEnsureMore()) { - // might be \r\n - if (buffer[offset] == '\n') { - ++offset; - } - } - lastInRow.setValue(true); - ++physicalRowNum; - return; - } - ++offset; - } - } - - /** - * @return true if there are more characters. - */ - private boolean tryEnsureMore() throws CsvReaderException { - if (offset != size) { - return true; - } - spillRange(); - refillBuffer(); - return size != 0; - } - - /** - * Spill the current range to the spillBuffer. Normally we try to stay in the "common case", where the entire cell - * we are reading is consecutive characters in the underlying input buffer. This assumption fails when either there - * are escaped quotes (like "" needing to be interpreted as "), or when the cell we are reading spans the boundaries - * of two input buffers. In that case we "spill" the characters we have collected so far to the spillBuffer. - */ - private void spillRange() { - spillBuffer.append(buffer, startOffset, offset - startOffset); - startOffset = offset; - } - - /** - * Get another chunk of data from the Reader. 
- */ - private void refillBuffer() throws CsvReaderException { - offset = 0; - startOffset = 0; - try { - final int bytesRead = inputStream.read(buffer, 0, buffer.length); - if (bytesRead < 0) { - size = 0; - return; - } - if (bytesRead > 0) { - size = bytesRead; - return; - } - throw new CsvReaderException("Logic error: zero-length read"); - } catch (IOException inner) { - throw new CsvReaderException("Caught exception", inner); - } - } - - private void finish(final ByteSlice dest) { - if (spillBuffer.size() == 0) { - // If we never spilled then our whole output is in the input buffer. So we can - // just return a slice of the input buffer. - dest.reset(buffer, startOffset, offset); - return; - } - // Otherwise, append we need to append whatever residual is left to spillBuffer - // and return a slice of spillBuffer. - spillRange(); - dest.reset(spillBuffer.data(), 0, spillBuffer.size()); - } - - public int physicalRowNum() { - return physicalRowNum; - } - - /** - * Trim whitespace from the front and back of the slice. - * - * @param cs The slice, modified in-place to have whitespace (if any) removed. - */ - private static void trimWhitespace(final ByteSlice cs) { - final byte[] data = cs.data(); - int begin = cs.begin(); - int end = cs.end(); - while (begin != end && RangeTests.isSpaceOrTab(data[begin])) { - ++begin; - } - while (begin != end && RangeTests.isSpaceOrTab(data[end - 1])) { - --end; - } - cs.reset(data, begin, end); - } -} diff --git a/extensions/csv/src/main/java/io/deephaven/csv/reading/CsvReader.java b/extensions/csv/src/main/java/io/deephaven/csv/reading/CsvReader.java deleted file mode 100644 index 23e54a4540e..00000000000 --- a/extensions/csv/src/main/java/io/deephaven/csv/reading/CsvReader.java +++ /dev/null @@ -1,564 +0,0 @@ -package io.deephaven.csv.reading; - -import io.deephaven.csv.containers.ByteSlice; -import io.deephaven.csv.densestorage.DenseStorageReader; -import io.deephaven.csv.densestorage.DenseStorageWriter; -import io.deephaven.csv.parsers.Parser; -import io.deephaven.csv.sinks.Sink; -import io.deephaven.csv.parsers.Parsers; -import io.deephaven.csv.sinks.SinkFactory; -import io.deephaven.csv.tokenization.Tokenizer; -import io.deephaven.csv.util.CsvReaderException; -import io.deephaven.csv.util.Renderer; -import org.apache.commons.lang3.mutable.MutableBoolean; -import org.apache.commons.lang3.mutable.MutableObject; - -import java.io.InputStream; -import java.io.Reader; -import java.util.*; -import java.util.concurrent.*; -import java.util.function.Function; -import java.util.function.Predicate; - -/** - * A class for reading CSV data. Typical usage is: - *
1. Construct a CsvReader.
2. Customize the CsvReader by calling the various setXXX methods.
3. Arrange for the input text to be in an {@link InputStream}, encoded in UTF-8.
4. Prepare a {@link SinkFactory} which can in turn provide Sink<T> objects for the output data.
5. Call the {@link #read} method.
- * Furthermore, the setXXX methods can be used in a builder pattern. Example: - *
    - * final CsvReader csvr = new CsvReader()
- *   .setquoteChar('#')
- *   .setConcurrent(false)
    - *   .setParserFor("Timestamp", Parsers.DATETIME);
- * final InputStream r = ...;
    - * final SinkFactory f = ...;
    - * final CsvReader.Result res = csvr.read(r, f);
    - * 
    - */ -public final class CsvReader { - /** - * Whether to trim leading and trailing blanks from non-quoted values. - */ - private boolean ignoreSurroundingSpaces = false; - /** - * Whether to trim leading and trailing blanks from inside quoted values. - */ - private boolean trim = false; - /** - * Whether the incoming data has column headers. - */ - private boolean hasHeaders = true; - /** - * The quote character (used when you want field or line delimiters to be interpreted as literal text. For example: - * - *
-     * 123,"hello, there",456,
-     * 
- * Would be read as the three fields:
• 123
• hello, there
• 456
    - */ - private byte quoteChar = '"'; - /** - * The field delimiter (the character that separates one column from the next. - */ - private byte fieldDelimiter = ','; - /** - * Whether to run concurrently. In particular, the operation of reading the raw file, breaking it into columns, and - * storing that column text in memory can run in parallel with parsing the data for a given column, and all the - * column data parsers can themselves run in parallel. - */ - private boolean concurrent = true; - /** - * The user-defined set of parsers that participate in type inference. Defaults to Parsers.DEFAULT - */ - private List> parsers = new ArrayList<>(Parsers.DEFAULT); - /** - * Client-specified headers that can be used to override the existing headers in the input (if hasHeaders is true), - * or to provide absent headers (if hasHeaders is false). - */ - private List clientSpecifiedHeaders = new ArrayList<>(); - /** - * Override a specific column header by number. This is applied *after* clientSpecifiedHeaders. Column numbers start - * with 1. - */ - private final Map columnHeaderOverrides = new HashMap<>(); - /** - * Used to force a specific parser for a specific column, specified by column name. - */ - private final Map> parsersByColumnName = new HashMap<>(); - /** - * Used to force a specific parser for a specific column, specified by column number. Column numbers start with 1. - */ - private final Map> parsersByColumnNumber = new HashMap<>(); - /** - * The default string that means "null value" in the input. It is used if not overridden on a per-column basis. It - * defaults to the empty string. - */ - private String nullValueLiteral = ""; - /** - * Used to force a specific parser for a specific column, specified by column name. - */ - private final Map nullValueLiteralByColumnName = new HashMap<>(); - /** - * Used to force a specific parser for a specific column, specified by column number. Column numbers start with 1. - */ - private final Map nullValueLiteralByColumnNumber = new HashMap<>(); - /** - * The parser to be used when a column is entirely null (unless a specific parser has been forced by setting an - * entry in the parsers collection. - */ - private Parser nullParser; - /** - * An optional low-level parser that understands custom time zones. - */ - private Tokenizer.CustomTimeZoneParser customTimeZoneParser; - /** - * An optional validator for column headers. - */ - private Predicate headerValidator = s -> true; - /** - * An optional legalizer for column headers. - */ - private Function headerLegalizer = Function.identity(); - - /** - * Read the data. - * - * @param stream The input data, encoded in UTF-8. - * @param sinkFactory A factory that can provide Sink<T> of all appropriate types for the output data. Once - * the CsvReader determines what the column type is, t will use the SinkFactory to create an appropriate - * Sink<T> for the type. Note that the CsvReader might guess wrong, so it might create a Sink, - * partially populate it, and then abandon it. The final set of fully-populated Sinks will be returned in in - * the CsvReader.Result. - * @return A CsvReader.Result containing the column names, the number of columns, and the final set of - * fully-populated Sinks. 
- */ - public Result read(final InputStream stream, final SinkFactory sinkFactory) throws CsvReaderException { - final CellGrabber grabber = new CellGrabber(stream, quoteChar, fieldDelimiter, ignoreSurroundingSpaces, trim); - // For an "out" parameter - final MutableObject firstDataRowHolder = new MutableObject<>(); - final String[] headersTemp = determineHeadersToUse(grabber, firstDataRowHolder); - final byte[][] firstDataRow = firstDataRowHolder.getValue(); - final int numInputCols = headersTemp.length; - - // If the final column has a blank header, we assume the whole column is blank (we confirm this assumption - // in ParseInputToDenseStorage, as we're reading the file. - final String[] headersTemp2; - if (numInputCols != 0 && headersTemp[numInputCols - 1].isEmpty()) { - headersTemp2 = Arrays.copyOf(headersTemp, numInputCols - 1); - } else { - headersTemp2 = headersTemp; - } - final int numOutputCols = headersTemp2.length; - final String[] headersToUse = canonicalizeHeaders(headersTemp2); - - // Create a DenseStorageWriter and two readers for each column. - final DenseStorageWriter[] dsws = new DenseStorageWriter[numInputCols]; - final DenseStorageReader[] dsr0s = new DenseStorageReader[numInputCols]; - final DenseStorageReader[] dsr1s = new DenseStorageReader[numInputCols]; - // The arrays are sized to "numInputCols" but only populated up to "numOutputCols". - // The code in ParseInputToDenseStorge knows that a null DenseStorageWriter means that the column - // is all-empty and (once the data is confirmed to be empty) just drop the data. - for (int ii = 0; ii < numOutputCols; ++ii) { - final DenseStorageWriter dsw = new DenseStorageWriter(); - dsws[ii] = dsw; - dsr0s[ii] = dsw.newReader(); - dsr1s[ii] = dsw.newReader(); - } - - // Select an Excecutor based on whether the user wants the code to run asynchronously - // or not. - final ExecutorService exec = - concurrent ? Executors.newFixedThreadPool(numOutputCols + 1) : Executors.newSingleThreadExecutor(); - - final Future numRowsFuture = exec.submit( - () -> ParseInputToDenseStorage.doit(firstDataRow, nullValueLiteral, grabber, dsws)); - - - final ArrayList>> sinkFutures = new ArrayList<>(); - - for (int ii = 0; ii < numOutputCols; ++ii) { - final List> parsersToUse = calcParsersToUse(headersToUse[ii], ii + 1); - final String nullValueLiteralToUse = calcNullValueLiteralToUse(headersToUse[ii], ii + 1); - - final int iiCopy = ii; - final Future> fcb = exec.submit( - () -> ParseDenseStorageToColumn.doit(dsr0s[iiCopy], dsr1s[iiCopy], - parsersToUse, nullParser, customTimeZoneParser, - nullValueLiteralToUse, sinkFactory)); - sinkFutures.add(fcb); - } - - final long numRows; - final Sink[] sinks = new Sink[numOutputCols]; - try { - numRows = numRowsFuture.get(); - for (int ii = 0; ii < numOutputCols; ++ii) { - sinks[ii] = sinkFutures.get(ii).get(); - } - } catch (Exception inner) { - throw new CsvReaderException("Caught exception", inner); - } - - return new Result(numRows, headersToUse, sinks); - } - - /** - * Determine which list of parsers to use for type inference. Returns {@link #parsers} unless the user has set an - * override on a column name or column number basis. 
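To make the executor fan-out in read() above concrete, here is a toy, self-contained version of the same pattern: one future for the row-splitting pass plus one per column, all joined at the end. The submitted workloads are placeholders, not the real dense-storage parsing:

    import java.util.ArrayList;
    import java.util.List;
    import java.util.concurrent.ExecutorService;
    import java.util.concurrent.Executors;
    import java.util.concurrent.Future;

    public class ColumnFanOutSketch {
        public static void main(String[] args) throws Exception {
            final int numCols = 3;
            final ExecutorService exec = Executors.newFixedThreadPool(numCols + 1);
            // Stand-in for the pass that splits input rows into per-column text.
            final Future<Long> numRows = exec.submit(() -> 1_000L);
            final List<Future<String>> columns = new ArrayList<>();
            for (int ii = 0; ii < numCols; ++ii) {
                final int col = ii;
                // Stand-in for the per-column parse job.
                columns.add(exec.submit(() -> "parsed column " + col));
            }
            System.out.println("rows=" + numRows.get());
            for (Future<String> f : columns) {
                System.out.println(f.get());
            }
            exec.shutdown();
        }
    }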
- */ - private List> calcParsersToUse(final String columnName, final int oneBasedColumnNumber) { - Parser specifiedParser = parsersByColumnName.get(columnName); - if (specifiedParser != null) { - return List.of(specifiedParser); - } - specifiedParser = parsersByColumnNumber.get(oneBasedColumnNumber); - if (specifiedParser != null) { - return List.of(specifiedParser); - } - return parsers; - } - - /** - * Determine which null value literal to use. Returns {@link #nullValueLiteral} unless the user has set an override - * on a column name or column number basis. - */ - private String calcNullValueLiteralToUse(final String columnName, final int oneBasedColumnNumber) { - String result = nullValueLiteralByColumnName.get(columnName); - if (result != null) { - return result; - } - result = nullValueLiteralByColumnNumber.get(oneBasedColumnNumber); - if (result != null) { - return result; - } - return nullValueLiteral; - } - - /** - * Determine which headers to use. The result comes from either the first row of the file or the user-specified - * overrides. - */ - private String[] determineHeadersToUse(final CellGrabber grabber, final MutableObject firstDataRowHolder) - throws CsvReaderException { - String[] headersToUse = null; - if (hasHeaders) { - final byte[][] firstRow = tryReadOneRow(grabber); - if (firstRow == null) { - throw new CsvReaderException("Can't proceed because hasHeaders is set but input file is empty"); - } - headersToUse = Arrays.stream(firstRow).map(String::new).toArray(String[]::new); - } - - // Whether or not the input had headers, maybe override with client-specified headers. - if (clientSpecifiedHeaders.size() != 0) { - headersToUse = clientSpecifiedHeaders.toArray(new String[0]); - } - - // If we still have nothing, try generate synthetic column headers (works only if the file is non-empty, - // because we need to infer the column count). - final byte[][] firstDataRow; - if (headersToUse == null) { - firstDataRow = tryReadOneRow(grabber); - if (firstDataRow == null) { - throw new CsvReaderException( - "Can't proceed because input file is empty and client has not specified headers"); - } - headersToUse = new String[firstDataRow.length]; - for (int ii = 0; ii < headersToUse.length; ++ii) { - headersToUse[ii] = "Column" + (ii + 1); - } - } else { - firstDataRow = null; - } - - // Apply column specific overrides. - for (Map.Entry entry : columnHeaderOverrides.entrySet()) { - headersToUse[entry.getKey() - 1] = entry.getValue(); - } - - firstDataRowHolder.setValue(firstDataRow); - return headersToUse; - } - - private String[] canonicalizeHeaders(final String[] headers) throws CsvReaderException { - final String[] legalized = headerLegalizer.apply(headers); - final Set unique = new HashSet<>(); - final List repeats = new ArrayList<>(); - final List invalidNames = new ArrayList<>(); - for (String header : legalized) { - if (!unique.add(header)) { - repeats.add(header); - } else if (!headerValidator.test(header)) { - // Using an "else if" because we only want to run each unique name once through the validator. 
- invalidNames.add(header); - } - } - - if (repeats.isEmpty() && invalidNames.isEmpty()) { - return legalized; - } - - final StringBuilder sb = new StringBuilder("Some column headers are invalid."); - if (!repeats.isEmpty()) { - sb.append(" Repeated headers: "); - sb.append(Renderer.renderList(repeats)); - } - if (!invalidNames.isEmpty()) { - sb.append(" Invalid headers: "); - sb.append(Renderer.renderList(invalidNames)); - } - throw new CsvReaderException(sb.toString()); - } - - /** - * Try to read one row from the input. Returns false if the input ends before one row has been read. - * - * @return The first row as a byte[][] or null if the input was exhausted. - */ - private static byte[][] tryReadOneRow(final CellGrabber grabber) throws CsvReaderException { - final List headers = new ArrayList<>(); - - // Grab the header - final ByteSlice slice = new ByteSlice(); - final MutableBoolean lastInRow = new MutableBoolean(); - do { - if (!grabber.grabNext(slice, lastInRow)) { - return null; - } - final byte[] item = new byte[slice.size()]; - slice.copyTo(item, 0); - headers.add(item); - } while (!lastInRow.booleanValue()); - return headers.toArray(new byte[0][]); - } - - /** - * Sets whether to trim leading and trailing blanks from non-quoted values. This really only matters for columns - * that are inferred to be of type String. Numeric columns ignore surrounding whitespace regardless of this setting. - */ - public CsvReader setIgnoreSurroundingSpaces(final boolean value) { - ignoreSurroundingSpaces = value; - return this; - } - - /** - * Sets whether to trim leading and trailing blanks from inside quoted values. This really only matters for columns - * that are inferred to be of type String. Numeric columns ignore surrounding whitespace regardless of this setting. - */ - public CsvReader setTrim(final boolean value) { - trim = value; - return this; - } - - /** - * Sets whether the first row of the input is column headers. - */ - public CsvReader setHasHeaders(final boolean value) { - hasHeaders = value; - return this; - } - - /** - * Sets the field delimiter. Typically the comma or tab character. - */ - public CsvReader setFieldDelimiter(final char value) { - if (value > 0x7f) { - throw new IllegalArgumentException("Field delimiter needs to be a 7-bit ASCII character"); - } - fieldDelimiter = (byte) value; - return this; - } - - /** - * Sets the quote character. Used by the input when it needs to escape special characters like field or line - * delimiters. A doubled quote character represents itself. Examples (assuming the quote character is set to '"'): - *
• "Hello, there": the string Hello, there
• "Hello""there": the string Hello"there
• """": the string "
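A small, hedged sketch of the doubled-quote rule these examples describe (not the CellGrabber implementation, which works on byte slices and a spill buffer): given the body of a quoted field, each quote pair collapses to one literal quote:

    static String unescapeQuotedBody(String body, char quoteChar) {
        final StringBuilder sb = new StringBuilder(body.length());
        for (int i = 0; i < body.length(); i++) {
            final char c = body.charAt(i);
            sb.append(c);
            // A doubled quote character represents a single literal quote.
            if (c == quoteChar && i + 1 < body.length() && body.charAt(i + 1) == quoteChar) {
                i++; // Skip the second quote of the pair.
            }
        }
        return sb.toString();
    }

For example, unescapeQuotedBody("Hello\"\"there", '"') yields Hello"there, matching the second example above.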
    - */ - public CsvReader setquoteChar(final char value) { - if (value > 0x7f) { - throw new IllegalArgumentException("Quote character needs to be a 7-bit ASCII character"); - } - quoteChar = (byte) value; - return this; - } - - /** - * Whether the reader should run the file tokenizer and column parsing jobs concurrently, using multiple threads. - * This typically yields better performance. - */ - public CsvReader setConcurrent(final boolean value) { - this.concurrent = value; - return this; - } - - /** - * Set the list of parsers participating in type inference. - */ - public CsvReader setParsers(final Collection> parsers) { - this.parsers = new ArrayList<>(parsers); - return this; - } - - /** - * Add parsers to the existing list of parsers participating in type inference. - */ - public CsvReader addParsers(Parser... parsers) { - this.parsers.addAll(List.of(parsers)); - return this; - } - - /** - * Overrides (if hasHeaders is true) or provides (if hasHeaders is false) the column headers. - */ - public CsvReader setHeaders(final Collection headers) { - clientSpecifiedHeaders = new ArrayList<>(headers); - return this; - } - - /** - * Overrides (if hasHeaders is true) or provides (if hasHeaders is false) the column headers. - */ - public CsvReader setHeaders(final String... headers) { - clientSpecifiedHeaders = List.of(headers); - return this; - } - - /** - * Overrides a specific column header by index. Columns are numbered starting with 1. - */ - public CsvReader setHeader(final int columnNumber, final String header) { - columnHeaderOverrides.put(columnNumber, header); - return this; - } - - /** - * Specify a parser for a given column name, rather than using inference to pick a type. - */ - public CsvReader setParserFor(final String name, final Parser parser) { - this.parsersByColumnName.put(name, parser); - return this; - } - - /** - * Specify a parser for a given column number, rather than using inference to pick a type. The column numbers are - * 1-based. - */ - public CsvReader setParserFor(final int columnNumber, final Parser parser) { - this.parsersByColumnNumber.put(columnNumber, parser); - return this; - } - - /** - * Specify the default null value literal to be used if not overridden for a column. - */ - public CsvReader setNullValueLiteral(final String nullValueLiteral) { - this.nullValueLiteral = nullValueLiteral; - return this; - } - - /** - * Specify the null value literal for a given column name. - */ - public CsvReader setNullValueLiteralFor(final String name, final String nullValueLiteral) { - this.nullValueLiteralByColumnName.put(name, nullValueLiteral); - return this; - } - - /** - * Specify a parser for a given column number, rather than using inference to pick a type. The column numbers are - * 1-based. - */ - public CsvReader setNullValueLiteralFor(final int columnNumber, final String nullValueLiteral) { - this.nullValueLiteralByColumnNumber.put(columnNumber, nullValueLiteral); - return this; - } - - /** - * Specify the parser to be used for columns that contain all nulls. (Unless that column has a parser specified by - * {@link #setParserFor}. - */ - public CsvReader setNullParser(final Parser nullParser) { - this.nullParser = nullParser; - return this; - } - - /** - * Specify a plugin to be used to parse custom time zones. This permits the caller to support custom time zones such - * as the " NY" that appears in "2020-05-05 12:34:56 NY". The first digit (here, space) must be something other than - * "Z". 
- */ - public CsvReader setCustomTimeZoneParser(final Tokenizer.CustomTimeZoneParser customTimeZoneParser) { - this.customTimeZoneParser = customTimeZoneParser; - return this; - } - - public CsvReader setHeaderValidator(final Predicate headerValidator) { - this.headerValidator = headerValidator; - return this; - } - - public CsvReader setHeaderLegalizer(final Function headerLegalizer) { - this.headerLegalizer = headerLegalizer; - return this; - } - - /** - * Result of {@link #read}. - */ - public static final class Result { - private final long numRows; - private final String[] columnNames; - private final Sink[] columns; - - public Result(final long numRows, final String[] columnNames, final Sink[] columns) { - this.numRows = numRows; - this.columnNames = columnNames; - this.columns = columns; - } - - /** - * Number of rows in the input. - */ - public long numRows() { - return numRows; - } - - /** - * The column names. - */ - public String[] columnNames() { - return columnNames; - } - - /** - * Data for each column. Each Sink was constructed by some method in the SinkFactory that the caller passed to - * {@link #read}. - */ - public Sink[] columns() { - return columns; - } - - /** - * The number of columns. - */ - public int numCols() { - return columns.length; - } - } -} diff --git a/extensions/csv/src/main/java/io/deephaven/csv/reading/ParseDenseStorageToColumn.java b/extensions/csv/src/main/java/io/deephaven/csv/reading/ParseDenseStorageToColumn.java deleted file mode 100644 index 39cee007f01..00000000000 --- a/extensions/csv/src/main/java/io/deephaven/csv/reading/ParseDenseStorageToColumn.java +++ /dev/null @@ -1,376 +0,0 @@ -package io.deephaven.csv.reading; - -import io.deephaven.csv.parsers.*; -import io.deephaven.csv.densestorage.DenseStorageReader; -import io.deephaven.csv.sinks.Sink; -import io.deephaven.csv.sinks.SinkFactory; -import io.deephaven.csv.tokenization.Tokenizer; -import io.deephaven.csv.util.CsvReaderException; -import org.apache.commons.lang3.mutable.MutableBoolean; -import org.apache.commons.lang3.mutable.MutableDouble; -import org.apache.commons.lang3.mutable.MutableLong; -import org.jetbrains.annotations.NotNull; - -import java.util.*; -import java.util.function.IntConsumer; - -/** - * The job of this class is to take a column of cell text, as prepared by {@link ParseInputToDenseStorage}, do type - * inference if appropriate, and parse the text into typed data. - */ -public final class ParseDenseStorageToColumn { - /** - * @param dsr A reader for the input. - * @param dsrAlt A second reader for the same input (used to perform the second pass over the data, if type - * inference deems a second pass to be necessary). - * @param parsers The set of parsers to try. If null, then {@link Parsers#DEFAULT} will be used. - * @param nullValueLiteral If a cell text is equal to this value, it will be interpreted as the null value. - * Typically set to the empty string. - * @param nullParser The Parser to use if parsers.size() > 1 but the column contains all null values. This is needed - * as a backstop because otherwise type inference would have no way to choose among the multiple parsers. - * @param sinkFactory Factory that makes all of the Sinks of various types, used to consume the data we produce. - * @return The {@link Sink}, provided by the caller's {@link SinkFactory}, that was selected to hold the column - * data. 
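Before the implementation, a rough illustration of the shape of the flow described above (skip leading null cells, then branch on what the first non-null cell looks like). All names here are invented and this is far simpler than the real inference:

    static String inferKindSketch(java.util.List<String> cells) {
        for (String c : cells) {
            if (c.isEmpty()) {
                continue; // Assumes the empty string is the null value literal.
            }
            if (c.chars().allMatch(Character::isDigit)) {
                return "numeric"; // The real code tries the numeric parser chain here.
            }
            return c.length() == 1 ? "char" : "string";
        }
        return "all-null"; // The real code falls back to the configured nullParser.
    }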
- */ - public static Sink doit(final DenseStorageReader dsr, final DenseStorageReader dsrAlt, - List> parsers, final Parser nullParser, - final Tokenizer.CustomTimeZoneParser customTimeZoneParser, - final String nullValueLiteral, - final SinkFactory sinkFactory) throws CsvReaderException { - Set> parserSet = new HashSet<>(Objects.requireNonNullElse(parsers, Parsers.DEFAULT)); - - final Tokenizer tokenizer = new Tokenizer(customTimeZoneParser); - final Parser.GlobalContext gctx = new Parser.GlobalContext(tokenizer, sinkFactory, nullValueLiteral); - - // Skip over leading null cells. There are three cases: - // 1. There is a non-null cell (so the type inference process can begin) - // 2. The column is full of all nulls - // 3. The column is empty - final IteratorHolder ih = new IteratorHolder(dsr); - boolean columnIsEmpty = true; - boolean columnIsAllNulls = true; - while (ih.tryMoveNext()) { - columnIsEmpty = false; - if (!gctx.isNullCell(ih)) { - columnIsAllNulls = false; - break; - } - } - - if (columnIsAllNulls) { - // We get here in cases 2 and 3: the column is all nulls, or the column is empty. - final Parser nullParserToUse = parserSet.size() == 1 ? parserSet.iterator().next() : nullParser; - if (nullParserToUse == null) { - throw new CsvReaderException( - "Column contains all null cells: can't infer type of column, and nullParser is not set."); - } - if (columnIsEmpty) { - return emptyParse(nullParserToUse, gctx); - } - return onePhaseParse(nullParserToUse, gctx, dsrAlt); - } - - final CategorizedParsers cats = CategorizedParsers.create(parserSet); - - if (cats.customParser != null) { - return onePhaseParse(cats.customParser, gctx, dsrAlt); - } - - // Numerics are special and they get their own fast path that uses Sources and Sinks rather than - // reparsing the text input. - final MutableDouble dummyDouble = new MutableDouble(); - if (!cats.numericParsers.isEmpty() && tokenizer.tryParseDouble(ih.bs(), dummyDouble)) { - return parseNumerics(cats, gctx, ih, dsrAlt); - } - - List> universeByPrecedence = List.of(Parsers.CHAR, Parsers.STRING); - final MutableBoolean dummyBoolean = new MutableBoolean(); - final MutableLong dummyLong = new MutableLong(); - if (cats.timestampParser != null && tokenizer.tryParseLong(ih.bs(), dummyLong)) { - universeByPrecedence = List.of(cats.timestampParser, Parsers.CHAR, Parsers.STRING); - } else if (cats.booleanParser != null && tokenizer.tryParseBoolean(ih.bs(), dummyBoolean)) { - universeByPrecedence = List.of(Parsers.BOOLEAN, Parsers.STRING); - } else if (cats.dateTimeParser != null && tokenizer.tryParseDateTime(ih.bs(), dummyLong)) { - universeByPrecedence = List.of(Parsers.DATETIME, Parsers.STRING); - } - List> parsersToUse = limitToSpecified(universeByPrecedence, parserSet); - return parseFromList(parsersToUse, gctx, ih, dsrAlt); - } - - @NotNull - private static Sink parseNumerics( - CategorizedParsers cats, final Parser.GlobalContext gctx, final IteratorHolder ih, - final DenseStorageReader dsrAlt) - throws CsvReaderException { - final List wrappers = new ArrayList<>(); - for (Parser parser : cats.numericParsers) { - final ParserResultWrapper prw = parseNumericsHelper(parser, gctx, ih); - wrappers.add(prw); - if (ih.isExhausted()) { - // Parsed everything with numerics! 
- return unifyNumericResults(gctx, wrappers); - } - } - - return parseFromList(cats.charAndStringParsers, gctx, ih, dsrAlt); - } - - @NotNull - private static ParserResultWrapper parseNumericsHelper(Parser parser, - final Parser.GlobalContext gctx, final IteratorHolder ih) - throws CsvReaderException { - final Parser.ParserContext pctx = parser.makeParserContext(gctx, Parser.CHUNK_SIZE); - final long begin = ih.numConsumed() - 1; - final long end = parser.tryParse(gctx, pctx, ih, begin, Long.MAX_VALUE, true); - return new ParserResultWrapper(pctx, begin, end); - } - - @NotNull - private static Sink parseFromList(final List> parsers, - final Parser.GlobalContext gctx, final IteratorHolder ih, - final DenseStorageReader dsrAlt) throws CsvReaderException { - if (parsers.isEmpty()) { - throw new CsvReaderException("No available parsers."); - } - - for (int ii = 0; ii < parsers.size() - 1; ++ii) { - final Sink result = tryTwoPhaseParse(parsers.get(ii), gctx, ih, dsrAlt); - if (result != null) { - return result; - } - } - - // The final parser in the set gets special (more efficient) handling because there's nothing to fall back to. - return onePhaseParse(parsers.get(parsers.size() - 1), gctx, dsrAlt); - } - - private static Sink tryTwoPhaseParse(final Parser parser, - final Parser.GlobalContext gctx, final IteratorHolder ih, - final DenseStorageReader dsrAlt) throws CsvReaderException { - final long phaseOneStart = ih.numConsumed() - 1; - final Parser.ParserContext pctx = parser.makeParserContext(gctx, Parser.CHUNK_SIZE); - parser.tryParse(gctx, pctx, ih, phaseOneStart, Long.MAX_VALUE, true); - if (!ih.isExhausted()) { - // This parser couldn't make it to the end but there are others remaining to try. Signal a failure to the - // caller so that it can try the next one. - return null; - } - if (phaseOneStart == 0) { - // Reached end, and started at zero so everything was parsed and we are done. - return pctx.sink(); - } - - final IteratorHolder ihAlt = new IteratorHolder(dsrAlt); - ihAlt.tryMoveNext(); // Input is not empty, so we know this will succeed. - final long end = parser.tryParse(gctx, pctx, ihAlt, 0, phaseOneStart, false); - - if (end == phaseOneStart) { - return pctx.sink(); - } - final String message = - "Logic error: second parser phase failed on input. Parser was: " + parser.getClass().getCanonicalName(); - throw new RuntimeException(message); - } - - @NotNull - private static Sink onePhaseParse(final Parser parser, - final Parser.GlobalContext gctx, - final DenseStorageReader dsrAlt) throws CsvReaderException { - final Parser.ParserContext pctx = parser.makeParserContext(gctx, Parser.CHUNK_SIZE); - final IteratorHolder ihAlt = new IteratorHolder(dsrAlt); - ihAlt.tryMoveNext(); // Input is not empty, so we know this will succeed. - parser.tryParse(gctx, pctx, ihAlt, 0, Long.MAX_VALUE, true); - if (ihAlt.isExhausted()) { - return pctx.sink(); - } - final String message = "One phase parser failed on input. Parser was: " + parser.getClass().getCanonicalName(); - throw new CsvReaderException(message); - } - - @NotNull - private static Sink emptyParse(final Parser parser, - final Parser.GlobalContext gctx) throws CsvReaderException { - // The parser won't do any "parsing" here, but it will create a Sink. - final Parser.ParserContext pctx = parser.makeParserContext(gctx, Parser.CHUNK_SIZE); - parser.tryParse(gctx, pctx, null, 0, 0, true); // Result ignored. 
- return pctx.sink(); - } - - @NotNull - private static Sink unifyNumericResults(final Parser.GlobalContext gctx, - final List wrappers) { - if (wrappers.isEmpty()) { - throw new RuntimeException("Logic error: no parser results."); - } - final ParserResultWrapper dest = wrappers.get(wrappers.size() - 1); - - // BTW, there's an edge case where there's only one parser in the list. In that case first == dest, - // but this code still does the right thing. - final ParserResultWrapper first = wrappers.get(0); - fillNulls(gctx, dest.pctx, 0, first.begin); - - long destBegin = first.begin; - for (int ii = 0; ii < wrappers.size() - 1; ++ii) { - final ParserResultWrapper curr = wrappers.get(ii); - copy(gctx, curr.pctx, dest.pctx, curr.begin, curr.end, destBegin); - destBegin += (curr.end - curr.begin); - } - return dest.pctx.sink(); - } - - private static void copy(final Parser.GlobalContext gctx, - final Parser.ParserContext sourceCtx, final Parser.ParserContext destCtx, - final long srcBegin, final long srcEnd, - final long destBegin) { - TypeConverter.copy(sourceCtx.source(), destCtx.sink(), - srcBegin, srcEnd, destBegin, - sourceCtx.valueChunk(), destCtx.valueChunk(), gctx.nullChunk()); - } - - - private static void fillNulls(final Parser.GlobalContext gctx, - final Parser.ParserContext pctx, - final long begin, final long end) { - if (begin == end) { - return; - } - final boolean[] nullBuffer = gctx.nullChunk(); - final Sink destSink = pctx.sink(); - final TARRAY values = pctx.valueChunk(); - - final int sizeToInit = Math.min(nullBuffer.length, Math.toIntExact(end - begin)); - Arrays.fill(nullBuffer, 0, sizeToInit, true); - - for (long current = begin; current != end;) { // no ++ - final long endToUse = Math.min(current + nullBuffer.length, end); - // Don't care about the actual values, only the null flag values (which are all true). - destSink.write(values, nullBuffer, current, endToUse, false); - current = endToUse; - } - } - - private static List limitToSpecified(Collection items, Set limitTo) { - final List result = new ArrayList<>(); - for (T item : items) { - if (limitTo.contains(item)) { - result.add(item); - } - } - return result; - } - - private static class CategorizedParsers { - public static CategorizedParsers create(final Collection> parsers) throws CsvReaderException { - Parser booleanParser = null; - final Set> specifiedNumericParsers = new HashSet<>(); - // Subset of the above. 
- final List> specifiedFloatingPointParsers = new ArrayList<>(); - Parser dateTimeParser = null; - final Set> specifiedCharAndStringParsers = new HashSet<>(); - final List> specifiedTimeStampParsers = new ArrayList<>(); - final List> specifiedCustomParsers = new ArrayList<>(); - for (Parser p : parsers) { - if (p == Parsers.BYTE || p == Parsers.SHORT || p == Parsers.INT || p == Parsers.LONG) { - specifiedNumericParsers.add(p); - continue; - } - - if (p == Parsers.FLOAT_FAST || p == Parsers.FLOAT_STRICT || p == Parsers.DOUBLE) { - specifiedNumericParsers.add(p); - specifiedFloatingPointParsers.add(p); - continue; - } - - if (p == Parsers.TIMESTAMP_SECONDS || p == Parsers.TIMESTAMP_MILLIS || p == Parsers.TIMESTAMP_MICROS - || p == Parsers.TIMESTAMP_NANOS) { - specifiedTimeStampParsers.add(p); - continue; - } - - if (p == Parsers.CHAR || p == Parsers.STRING) { - specifiedCharAndStringParsers.add(p); - continue; - } - - if (p == Parsers.BOOLEAN) { - booleanParser = p; - continue; - } - - if (p == Parsers.DATETIME) { - dateTimeParser = p; - continue; - } - - specifiedCustomParsers.add(p); - } - - if (specifiedFloatingPointParsers.size() > 1) { - throw new CsvReaderException("There is more than one floating point parser in the parser set."); - } - - if (specifiedTimeStampParsers.size() > 1) { - throw new CsvReaderException("There is more than one timestamp parser in the parser set."); - } - - if (specifiedCustomParsers.size() > 1) { - throw new CsvReaderException("There is more than one custom parser in the parser set."); - } - - if (!specifiedCustomParsers.isEmpty() && parsers.size() != 1) { - throw new CsvReaderException( - "When a custom parser is specified, it must be the only parser in the set."); - } - - if (!specifiedNumericParsers.isEmpty() && !specifiedTimeStampParsers.isEmpty()) { - throw new CsvReaderException("The parser set must not contain both numeric and timestamp parsers."); - } - - final List> allNumericParsersByPrecedence = List.of( - Parsers.BYTE, Parsers.SHORT, Parsers.INT, Parsers.LONG, Parsers.FLOAT_FAST, Parsers.FLOAT_STRICT, - Parsers.DOUBLE); - final List> allCharAndStringParsersByPrecedence = List.of( - Parsers.CHAR, Parsers.STRING); - - final List> numericParsers = - limitToSpecified(allNumericParsersByPrecedence, specifiedNumericParsers); - final List> charAndStringParsers = - limitToSpecified(allCharAndStringParsersByPrecedence, specifiedCharAndStringParsers); - final Parser timestampParser = - specifiedTimeStampParsers.isEmpty() ? null : specifiedTimeStampParsers.get(0); - final Parser customParser = specifiedCustomParsers.isEmpty() ? 
null : specifiedCustomParsers.get(0); - - return new CategorizedParsers(booleanParser, numericParsers, dateTimeParser, - charAndStringParsers, timestampParser, customParser); - } - - private final Parser booleanParser; - private final List> numericParsers; - private final Parser dateTimeParser; - private final List> charAndStringParsers; - private final Parser timestampParser; - private final Parser customParser; - - private CategorizedParsers(Parser booleanParser, List> numericParsers, Parser dateTimeParser, - List> charAndStringParsers, Parser timestampParser, Parser customParser) { - this.booleanParser = booleanParser; - this.numericParsers = numericParsers; - this.dateTimeParser = dateTimeParser; - this.charAndStringParsers = charAndStringParsers; - this.timestampParser = timestampParser; - this.customParser = customParser; - } - } - - private static class ParserResultWrapper { - private final Parser.ParserContext pctx; - private final long begin; - private final long end; - - public ParserResultWrapper(Parser.ParserContext pctx, long begin, long end) { - this.pctx = pctx; - this.begin = begin; - this.end = end; - } - } -} diff --git a/extensions/csv/src/main/java/io/deephaven/csv/reading/ParseInputToDenseStorage.java b/extensions/csv/src/main/java/io/deephaven/csv/reading/ParseInputToDenseStorage.java deleted file mode 100644 index d5679331976..00000000000 --- a/extensions/csv/src/main/java/io/deephaven/csv/reading/ParseInputToDenseStorage.java +++ /dev/null @@ -1,139 +0,0 @@ -package io.deephaven.csv.reading; - -import io.deephaven.csv.containers.ByteSlice; -import io.deephaven.csv.densestorage.DenseStorageReader; -import io.deephaven.csv.densestorage.DenseStorageWriter; -import io.deephaven.csv.util.CsvReaderException; -import org.apache.commons.lang3.mutable.MutableBoolean; - -import java.nio.charset.StandardCharsets; - -/** - * The job of this class is to take the input text, parse the CSV format (dealing with quoting, escaping, field - * delimiters, and line delimiters) in order to break it into columns of cells (where a cell just contains uninterpreted - * text... we haven't yet tried to parse it into types), and to feed each of those columns of cells into its own - * {@link DenseStorageWriter}. On the reading side, there is a {@link DenseStorageReader} paired with every - * {@link DenseStorageWriter} and its job is to pull the data back out and have it processed by the - * {@link ParseDenseStorageToColumn} class. The job of that class is to pick the most appropriate parser, typically - * by doing type inference, and parse the text into typed data. The reason for all this separation is that the - * {@link DenseStorageReader} and {@link ParseDenseStorageToColumn} classes can run concurrently for each column. - */ -public class ParseInputToDenseStorage { - /** - * Take cells of text (parsed by the {@link CellGrabber}) and feed them to the various {@link DenseStorageWriter} - * classes. - * - * @param optionalFirstDataRow If not null, this is the first row of data from the file, which the caller had to - * peek at in order to know the number of columns in the file. - * @param nullValueLiteral The configured null value literal. This is used for providing the null value literal to - * the downstream processing code (namely the {@link ParseDenseStorageToColumn} code). - * @param grabber The {@link CellGrabber} which does all the CSV format handling (delimiters, quotes, etc.). - * @param dsws The array of {@link DenseStorageWriter}s, one for each column.
As a special case, if a given - * {@link DenseStorageWriter} is null, then instead of passing data to it, we confirm that the data is the - * empty string and then just drop the data. This is used to handle input files that have a trailing empty - * column on the right. - * @return The number of data rows in the input (i.e. not including headers or strings split across multiple lines). - */ - public static long doit(final byte[][] optionalFirstDataRow, final String nullValueLiteral, - final CellGrabber grabber, - final DenseStorageWriter[] dsws) throws CsvReaderException { - final ByteSlice slice = new ByteSlice(); - final int numCols = dsws.length; - - // If a "short row" is encountered (one with a fewer-than-expected number of columns) we will treat - // it as if the missing cells contained the nullValueLiteral. - final byte[] nullValueBytes = nullValueLiteral.getBytes(StandardCharsets.UTF_8); - final ByteSlice nullSlice = new ByteSlice(nullValueBytes, 0, nullValueBytes.length); - - // This is the number of data rows read. - long logicalRowNum = 0; - - // There is a case (namely when the file has no headers and the client hasn't specified - // them either) where the CsvReader was forced to read the first row of data from the file - // in order to determine the number of columns. If this happened, optionalFirstDataRow will - // be non-null and we can process it as data here. Then the rest of the processing can - // proceed as normal. - if (optionalFirstDataRow != null) { - if (optionalFirstDataRow.length != numCols) { - throw new CsvReaderException(String.format("Expected %d columns but optionalFirstRow had %d", - numCols, optionalFirstDataRow.length)); - } - for (int ii = 0; ii < optionalFirstDataRow.length; ++ii) { - final byte[] temp = optionalFirstDataRow[ii]; - slice.reset(temp, 0, temp.length); - appendToDenseStorageWriter(dsws[ii], slice); - } - ++logicalRowNum; - } - - // Grab the remaining lines and store them. - // The outer while is the "row" iteration. - final MutableBoolean lastInRow = new MutableBoolean(); - OUTER: while (true) { - // As we start processing the next data row, grab the row number from the CellGrabber. This number refers - // to the (zero-based) "physical" row number of the file. Now is a logical time to grab that number, because - // a "logical" data row may span multiple "physical" rows, and if we have to report an error to the caller, - // it's clearest if we record the physical row number where the logical row started. - final long physicalRowNum = grabber.physicalRowNum(); - - // Zero-based column number. - int colNum = 0; - - try { - // The inner while is the "column" iteration - while (true) { - if (!grabber.grabNext(slice, lastInRow)) { - if (colNum == 0) { - break OUTER; - } - // Can't get here. If there is any data at all in the last row, and *then* the file ends, - // grabNext() will return true, with lastInRow set. 
- throw new RuntimeException("Logic error: uncaught short last row"); - } - appendToDenseStorageWriter(dsws[colNum], slice); - ++colNum; - if (colNum == numCols) { - if (!lastInRow.booleanValue()) { - throw new CsvReaderException( - String.format("Row %d has too many columns (expected %d)", physicalRowNum + 1, - numCols)); - } - break; - } - if (lastInRow.booleanValue()) { - // Short rows are padded with null - while (colNum != numCols) { - appendToDenseStorageWriter(dsws[colNum], nullSlice); - ++colNum; - } - break; - } - } - } catch (Exception e) { - final String message = String.format("While processing row %d, column %d:", - physicalRowNum + 1, colNum + 1); - throw new CsvReaderException(message, e); - } - ++logicalRowNum; - } - for (DenseStorageWriter dsw : dsws) { - if (dsw != null) { - dsw.finish(); - } - } - - return logicalRowNum; - } - - private static void appendToDenseStorageWriter(final DenseStorageWriter dsw, final ByteSlice bs) - throws CsvReaderException { - if (dsw != null) { - dsw.append(bs); - return; - } - if (bs.size() != 0) { - throw new CsvReaderException("Column assumed empty but contains data"); - } - - } -} diff --git a/extensions/csv/src/main/java/io/deephaven/csv/reading/TypeConverter.java b/extensions/csv/src/main/java/io/deephaven/csv/reading/TypeConverter.java deleted file mode 100644 index c198a69f42f..00000000000 --- a/extensions/csv/src/main/java/io/deephaven/csv/reading/TypeConverter.java +++ /dev/null @@ -1,229 +0,0 @@ -package io.deephaven.csv.reading; - -import io.deephaven.csv.sinks.Sink; -import io.deephaven.csv.sinks.Source; - -import java.lang.reflect.Array; -import java.util.function.BiConsumer; - -public class TypeConverter { - private static final int BYTE = 1; - private static final int SHORT = 2; - private static final int INT = 3; - private static final int LONG = 4; - private static final int FLOAT = 5; - private static final int DOUBLE = 6; - - public static void copy(final Source source, final Sink dest, final long srcBegin, - final long srcEnd, - final long destBegin, final TARRAY srcChunk, final UARRAY destChunk, final boolean[] isNull) { - if (srcBegin == srcEnd) { - return; - } - final int srcChunkSize = Array.getLength(srcChunk); - final int destChunkSize = Array.getLength(destChunk); - final int isNullChunkSize = isNull.length; - if (srcChunkSize != destChunkSize || srcChunkSize != isNullChunkSize) { - final String message = String.format("Logic error: chunk sizes differ: %d vs %d vs %d", - srcChunkSize, destChunkSize, isNullChunkSize); - throw new RuntimeException(message); - } - - final CopyOperation performCopy = getChunkCopierFor(srcChunk.getClass(), destChunk.getClass()); - - long srcCurrent = srcBegin; - long destCurrent = destBegin; - while (srcCurrent != srcEnd) { - final long srcEndToUse = Math.min(srcCurrent + srcChunkSize, srcEnd); - final int copySize = Math.toIntExact(srcEndToUse - srcCurrent); - final long destEndToUse = destCurrent + copySize; - source.read(srcChunk, isNull, srcCurrent, srcEndToUse); - performCopy.accept(srcChunk, destChunk, copySize); - dest.write(destChunk, isNull, destCurrent, destEndToUse, false); - srcCurrent = srcEndToUse; - destCurrent = destEndToUse; - } - } - - private static CopyOperation getChunkCopierFor(final Class srcClass, final Class destClass) { - final int srcType = identify(srcClass); - final int sinkType = identify(destClass); - switch (srcType * 100 + sinkType) { - case BYTE * 100 + SHORT: - return TypeConverter::copyByteToShort; - case BYTE * 100 + INT: - return 
TypeConverter::copyByteToInt; - case BYTE * 100 + LONG: - return TypeConverter::copyByteToLong; - case BYTE * 100 + FLOAT: - return TypeConverter::copyByteToFloat; - case BYTE * 100 + DOUBLE: - return TypeConverter::copyByteToDouble; - - case SHORT * 100 + INT: - return TypeConverter::copyShortToInt; - case SHORT * 100 + LONG: - return TypeConverter::copyShortToLong; - case SHORT * 100 + FLOAT: - return TypeConverter::copyShortToFloat; - case SHORT * 100 + DOUBLE: - return TypeConverter::copyShortToDouble; - - case INT * 100 + LONG: - return TypeConverter::copyIntToLong; - case INT * 100 + FLOAT: - return TypeConverter::copyIntToFloat; - case INT * 100 + DOUBLE: - return TypeConverter::copyIntToDouble; - - case LONG * 100 + FLOAT: - return TypeConverter::copyLongToFloat; - case LONG * 100 + DOUBLE: - return TypeConverter::copyLongToDouble; - - default: { - final String message = String.format("Logic error: don't have a converter from %s to %s", - srcClass.getCanonicalName(), destClass.getCanonicalName()); - throw new RuntimeException(message); - } - } - } - - private static int identify(final Class chunkClass) { - if (chunkClass == byte[].class) { - return BYTE; - } - if (chunkClass == short[].class) { - return SHORT; - } - if (chunkClass == int[].class) { - return INT; - } - if (chunkClass == long[].class) { - return LONG; - } - if (chunkClass == float[].class) { - return FLOAT; - } - if (chunkClass == double[].class) { - return DOUBLE; - } - throw new RuntimeException("Unsupported chunk type " + chunkClass.getCanonicalName()); - } - - private static void copyByteToShort(final Object src, final Object dest, final int size) { - final byte[] srcTyped = (byte[]) src; - final short[] destTyped = (short[]) dest; - for (int ii = 0; ii < size; ++ii) { - destTyped[ii] = srcTyped[ii]; - } - } - - private static void copyByteToInt(final Object src, final Object dest, final int size) { - final byte[] srcTyped = (byte[]) src; - final int[] destTyped = (int[]) dest; - for (int ii = 0; ii < size; ++ii) { - destTyped[ii] = srcTyped[ii]; - } - } - - private static void copyByteToLong(final Object src, final Object dest, final int size) { - final byte[] srcTyped = (byte[]) src; - final long[] destTyped = (long[]) dest; - for (int ii = 0; ii < size; ++ii) { - destTyped[ii] = srcTyped[ii]; - } - } - - private static void copyByteToFloat(final Object src, final Object dest, final int size) { - final byte[] srcTyped = (byte[]) src; - final float[] destTyped = (float[]) dest; - for (int ii = 0; ii < size; ++ii) { - destTyped[ii] = srcTyped[ii]; - } - } - - private static void copyByteToDouble(final Object src, final Object dest, final int size) { - final byte[] srcTyped = (byte[]) src; - final double[] destTyped = (double[]) dest; - for (int ii = 0; ii < size; ++ii) { - destTyped[ii] = srcTyped[ii]; - } - } - - private static void copyShortToInt(final Object src, final Object dest, final int size) { - final short[] srcTyped = (short[]) src; - final int[] destTyped = (int[]) dest; - for (int ii = 0; ii < size; ++ii) { - destTyped[ii] = srcTyped[ii]; - } - } - - private static void copyShortToLong(final Object src, final Object dest, final int size) { - final short[] srcTyped = (short[]) src; - final long[] destTyped = (long[]) dest; - for (int ii = 0; ii < size; ++ii) { - destTyped[ii] = srcTyped[ii]; - } - } - - private static void copyShortToFloat(final Object src, final Object dest, final int size) { - final short[] srcTyped = (short[]) src; - final float[] destTyped = (float[]) dest; - for (int ii = 0; ii < 
size; ++ii) { - destTyped[ii] = srcTyped[ii]; - } - } - - private static void copyShortToDouble(final Object src, final Object dest, final int size) { - final short[] srcTyped = (short[]) src; - final double[] destTyped = (double[]) dest; - for (int ii = 0; ii < size; ++ii) { - destTyped[ii] = srcTyped[ii]; - } - } - - private static void copyIntToLong(final Object src, final Object dest, final int size) { - final int[] srcTyped = (int[]) src; - final long[] destTyped = (long[]) dest; - for (int ii = 0; ii < size; ++ii) { - destTyped[ii] = srcTyped[ii]; - } - } - - private static void copyIntToFloat(final Object src, final Object dest, final int size) { - final int[] srcTyped = (int[]) src; - final float[] destTyped = (float[]) dest; - for (int ii = 0; ii < size; ++ii) { - destTyped[ii] = srcTyped[ii]; - } - } - - private static void copyIntToDouble(final Object src, final Object dest, final int size) { - final int[] srcTyped = (int[]) src; - final double[] destTyped = (double[]) dest; - for (int ii = 0; ii < size; ++ii) { - destTyped[ii] = srcTyped[ii]; - } - } - - private static void copyLongToFloat(final Object src, final Object dest, final int size) { - final long[] srcTyped = (long[]) src; - final float[] destTyped = (float[]) dest; - for (int ii = 0; ii < size; ++ii) { - destTyped[ii] = srcTyped[ii]; - } - } - - private static void copyLongToDouble(final Object src, final Object dest, final int size) { - final long[] srcTyped = (long[]) src; - final double[] destTyped = (double[]) dest; - for (int ii = 0; ii < size; ++ii) { - destTyped[ii] = srcTyped[ii]; - } - } - - private interface CopyOperation { - void accept(Object src, Object dest, int size); - } -} diff --git a/extensions/csv/src/main/java/io/deephaven/csv/sinks/Sink.java b/extensions/csv/src/main/java/io/deephaven/csv/sinks/Sink.java deleted file mode 100644 index 181f9e70a9a..00000000000 --- a/extensions/csv/src/main/java/io/deephaven/csv/sinks/Sink.java +++ /dev/null @@ -1,56 +0,0 @@ -package io.deephaven.csv.sinks; - -/** - * The system uses this interface to write to caller's column data structures. The reason this interface exists is so - * that the caller can use whatever column data structure they want to for final storage of the data. - * - * @param The array data type (e.g. short[], double[], etc.) holding a chunk of data to be written to the - * target data structure. The caller specifies what Sinks to use via the {@link SinkFactory} class. - */ -public interface Sink { - /** - * Write a chunk of data to the target data structure. Sample implementation:
    -     *     if (appending) {
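-     *         // appending: first pad the column out to destBegin with nulls, then add the new values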
-     *         while (myColumn.size() < destBegin) {
    -     *             myColumn.addNull();
    -     *         }
    -     *         int srcIndex = 0;
    -     *         for (long destIndex = destBegin; destIndex != destEnd; ++destIndex) {
    -     *             if (isNull[srcIndex]) {
    -     *                 myColumn.addNull();
    -     *             } else {
    -     *                 myColumn.add(src[srcIndex]);
    -     *             }
    -     *             ++srcIndex;
    -     *         }
    -     *     } else {
    -     *         // replacing
    -     *         int srcIndex = 0;
    -     *         for (long destIndex = destBegin; destIndex != destEnd; ++destIndex) {
    -     *             if (isNull[srcIndex]) {
    -     *                 myColumn[destIndex] = myNullRepresentation;
    -     *             } else {
    -     *                 myColumn[destIndex] = src[srcIndex];
    -     *             }
    -     *             ++srcIndex;
    -     *         }
    -     *
    -     *     }
    -     *     
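The sample above is pseudocode against an imaginary column type. To make the contract concrete, here is a compilable sketch over a boxed list; the class name {@code LongListColumn} and the use of null elements as the "null value" are illustrative assumptions, not part of these sources. It also implements {@code Source} (described further below), because integral columns must support the read-back path used when type inference widens to a larger type:

    import io.deephaven.csv.sinks.Sink;
    import io.deephaven.csv.sinks.Source;
    import java.util.ArrayList;
    import java.util.List;

    // Illustrative only: a long column usable both as a Sink (writing parsed values)
    // and as a Source (reading them back when inference widens to a larger type).
    public final class LongListColumn implements Sink<long[]>, Source<long[]> {
        private final List<Long> column = new ArrayList<>(); // a null element represents the null value

        @Override
        public void write(final long[] src, final boolean[] isNull,
                final long destBegin, final long destEnd, final boolean appending) {
            int srcIndex = 0;
            for (long destIndex = destBegin; destIndex != destEnd; ++destIndex) {
                final Long value = isNull[srcIndex] ? null : src[srcIndex];
                if (appending) {
                    while (column.size() < destIndex) {
                        column.add(null); // pad any gap with nulls
                    }
                    column.add(value);
                } else {
                    column.set((int) destIndex, value); // overwrite a previously written value
                }
                ++srcIndex;
            }
        }

        @Override
        public void read(final long[] dest, final boolean[] isNull,
                final long srcBegin, final long srcEnd) {
            int destIndex = 0;
            for (long srcIndex = srcBegin; srcIndex != srcEnd; ++srcIndex) {
                final Long value = column.get((int) srcIndex);
                isNull[destIndex] = value == null;
                dest[destIndex] = value == null ? 0 : value;
                ++destIndex;
            }
        }
    }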
    - * - * @param src The chunk of data, a typed array (short[], double[], etc) with valid elements in the half-open - * interval {@code [0..(destEnd - destBegin))}. - * @param isNull A boolean array, with the same range of valid elements. A "true" value at position {@code i} means - * that {@code src[i]} should be ignored and the element should be considered as the "null value", whose - * representation depends on the target data structure. A "false" value means that {@code src[i]} should be - * interpreted normally. - * @param destBegin The inclusive start index of the destination range. - * @param destEnd The exclusive end index of the destination range. - * @param appending A hint to the destination which indicates whether the system is appending to the data structure - * (if appending is true), or overwriting previously-written values (if appending is false). The caller - * promises to never span these two cases: i.e. it will never pass a chunk of data which partially overwrites - * values and then partially appends values. This flag is convenient but technically redundant because code - * can also determine what case it's in by comparing {@code destEnd} to the data structure's current size. - */ - void write(final TARRAY src, final boolean[] isNull, final long destBegin, final long destEnd, boolean appending); -} diff --git a/extensions/csv/src/main/java/io/deephaven/csv/sinks/SinkFactory.java b/extensions/csv/src/main/java/io/deephaven/csv/sinks/SinkFactory.java deleted file mode 100644 index ae13a3618fb..00000000000 --- a/extensions/csv/src/main/java/io/deephaven/csv/sinks/SinkFactory.java +++ /dev/null @@ -1,263 +0,0 @@ -package io.deephaven.csv.sinks; - -import org.apache.commons.lang3.mutable.Mutable; -import org.apache.commons.lang3.mutable.MutableObject; - -import java.util.function.Supplier; - -/** - * An interface which allows the CsvReader to write to column data structures whose details it is unaware of. Using this - * interface, the caller provides factory methods that make a Sink<TARRAY> for the corresponding data type. The - * integral parsers (byte, short, int, long) also provide a Source<TARRAY> via an out parameter, because the - * inference algorithm wants a fast path for reading back data it has already written. This is used in the case where - * the algorithm makes some forward progress on a numeric type but then decides to fall back to a wider numeric type. - * The system also supports more general kinds of fallback (e.g. from int to string), but in cases like that the system - * just reparses the original input text rather than asking the collection to read the data back. - * - * For example, if the system has parsed N shorts for a given column and then encounters an int value that doesn't fit - * in a short (or, alternatively, it encounters a reserved short and needs to reject it), it will read back the shorts - * already written and write them to an integer sink instead. - * - * The methods allow the caller to specify "reserved" values for types where it makes sense to have one. If a reserved - * value is encountered, the type inference process will move to the next wider type and try again. In typical practice - * this is used in systems that have a reserved sentinel value that represents null. For example, for a byte column, a - * system might reserve the value ((byte)-128) to represent the null byte, yet allow ((short)-128) to be a permissible - * short value. 
Likewise a system might reserve the value ((short)-32768) to represent the null short, but allow - * ((int)-32768) to be a permissible int value. - */ -public interface SinkFactory { - static & Source, TSHORTSINK extends Sink & Source, TINTSINK extends Sink & Source, TLONGSINK extends Sink & Source> SinkFactory of( - Supplier byteSinkSupplier, Byte reservedByte, - Supplier shortSinkSupplier, Short reservedShort, - Supplier intSinkSupplier, Integer reservedInt, - Supplier longSinkSupplier, Long reservedLong, - Supplier> floatSinkSupplier, Float reservedFloat, - Supplier> doubleSinkSupplier, Double reservedDouble, - Supplier> booleanAsByteSinkSupplier, // no Byte reservedBooleanAsByte, - Supplier> charSinkSupplier, Character reservedChar, - Supplier> stringSinkSupplier, String reservedString, - Supplier> dateTimeAsLongSinkSupplier, Long reservedDateTimeAsLong, - Supplier> timestampAsLongSinkSupplier, Long reservedTimestampAsLong) { - return new SinkFactory() { - @Override - public Sink forByte(MutableObject> source) { - final TBYTESINK result = byteSinkSupplier.get(); - source.setValue(result); - return result; - } - - @Override - public Byte reservedByte() { - return reservedByte; - } - - @Override - public Sink forShort(MutableObject> source) { - final TSHORTSINK result = shortSinkSupplier.get(); - source.setValue(result); - return result; - } - - @Override - public Short reservedShort() { - return reservedShort; - } - - @Override - public Sink forInt(MutableObject> source) { - final TINTSINK result = intSinkSupplier.get(); - source.setValue(result); - return result; - } - - @Override - public Integer reservedInt() { - return reservedInt; - } - - @Override - public Sink forLong(MutableObject> source) { - final TLONGSINK result = longSinkSupplier.get(); - source.setValue(result); - return result; - } - - @Override - public Long reservedLong() { - return reservedLong; - } - - @Override - public Sink forFloat() { - return floatSinkSupplier.get(); - } - - @Override - public Float reservedFloat() { - return reservedFloat; - } - - @Override - public Sink forDouble() { - return doubleSinkSupplier.get(); - } - - @Override - public Double reservedDouble() { - return reservedDouble; - } - - @Override - public Sink forBooleanAsByte() { - return booleanAsByteSinkSupplier.get(); - } - - @Override - public Sink forChar() { - return charSinkSupplier.get(); - } - - @Override - public Character reservedChar() { - return reservedChar; - } - - @Override - public Sink forString() { - return stringSinkSupplier.get(); - } - - @Override - public String reservedString() { - return reservedString; - } - - @Override - public Sink forDateTimeAsLong() { - return dateTimeAsLongSinkSupplier.get(); - } - - @Override - public Long reservedDateTimeAsLong() { - return reservedDateTimeAsLong; - } - - @Override - public Sink forTimestampAsLong() { - return timestampAsLongSinkSupplier.get(); - } - - @Override - public Long reservedTimestampAsLong() { - return reservedTimestampAsLong; - } - }; - } - - /** - * Provide a Sink and a Source for the byte representation. - */ - Sink forByte(MutableObject> source); - - /** - * The optional reserved value for the byte representation. - */ - Byte reservedByte(); - - /** - * Provide a Sink and a Source for the short representation. - */ - Sink forShort(MutableObject> source); - - /** - * The optional reserved value for the short representation. - */ - Short reservedShort(); - - /** - * Provide a Sink and a Source for the int representation. 
- */ - Sink forInt(MutableObject> source); - - /** - * The optional reserved value for the int representation. - */ - Integer reservedInt(); - - /** - * Provide a Sink and a Source for the long representation. - */ - Sink forLong(MutableObject> source); - - /** - * The optional reserved value for the long representation. - */ - Long reservedLong(); - - /** - * Provide a Sink for the float representation. - */ - Sink forFloat(); - - /** - * The optional reserved value for the float representation. - */ - Float reservedFloat(); - - /** - * Provide a Sink for the double representation. - */ - Sink forDouble(); - - /** - * The optional reserved value for the double representation. - */ - Double reservedDouble(); - - /** - * Provide a Sink for the boolean (as byte) representation. - */ - Sink forBooleanAsByte(); - - // there is no reserved value for the boolean as byte representation, as none is needed. - - /** - * Provide a Sink for the char representation. - */ - Sink forChar(); - - /** - * The optional reserved value for the char representation. - */ - Character reservedChar(); - - /** - * Provide a Sink for the String representation. - */ - Sink forString(); - - /** - * The optional reserved value for the String representation. - */ - String reservedString(); - - /** - * Provide a Sink for the DateTime (as long) representation. - */ - Sink forDateTimeAsLong(); - - /** - * The optional reserved value for the DateTime (as long) representation. - */ - Long reservedDateTimeAsLong(); - - /** - * Provide a Sink for the Timestamp (as long) representation. - */ - Sink forTimestampAsLong(); - - /** - * The optional reserved value for the Timestamp (as long) representation. - */ - Long reservedTimestampAsLong(); -} diff --git a/extensions/csv/src/main/java/io/deephaven/csv/sinks/Source.java b/extensions/csv/src/main/java/io/deephaven/csv/sinks/Source.java deleted file mode 100644 index c84545df9da..00000000000 --- a/extensions/csv/src/main/java/io/deephaven/csv/sinks/Source.java +++ /dev/null @@ -1,35 +0,0 @@ -package io.deephaven.csv.sinks; - -/** - * The system uses this interface to read from the caller's column data structures. The system only needs to do so in a - * limited number of cases, namely TARRAY = byte[], short[], int[], and long[]. This interface is used when the type - * inference process guesses wrong and needs a fast path to read the data back from a narrower data structure and write - * it to a wider one. - * - * @param The array data type (e.g. short[], int[], etc.) holding a chunk of data to be copied from the target - * data structure. - */ -public interface Source { - /** - * Read a chunk of data from the src data structure. Sample implementation:
    -     *     int destIndex = 0;
    -     *     for (long srcIndex = srcBegin; srcIndex != srcEnd; ++srcIndex) {
    -     *         if (myColumn.hasNullAt(srcIndex)) {
    -     *             isNull[destIndex] = true;
    -     *         } else {
    -     *             dest[destIndex] = myColumn.getValueAt(srcIndex);
    -     *             isNull[destIndex] = false;
    -     *         }
    -     *         ++destIndex;
    -     *     }
    -     *     
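Using the hypothetical {@code LongListColumn} sketch shown earlier under {@code Sink}, the read-back contract looks like this in practice ({@code SourceReadDemo} is an illustrative name):

    // Writes two rows (the second null), then reads them back through Source.read.
    public final class SourceReadDemo {
        public static void main(String[] args) {
            final LongListColumn column = new LongListColumn();
            column.write(new long[] {10L, 0L}, new boolean[] {false, true}, 0, 2, true);

            final long[] dest = new long[2];
            final boolean[] isNull = new boolean[2];
            column.read(dest, isNull, 0, 2);
            System.out.println(dest[0] + " " + isNull[0]); // 10 false
            System.out.println(isNull[1]);                 // true
        }
    }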
- * - * @param dest The chunk of data used to store values copied from the caller's column data structure. - * @param isNull A boolean array, with the same range of valid elements. A "true" value at position {@code i} means - * the corresponding element refers to the "null value" of the source data structure. A "false" value - * means that {@code dest[i]} should be interpreted normally. - * @param srcBegin The inclusive start index of the source range. - * @param srcEnd The exclusive end index of the source range. - */ - void read(final TARRAY dest, final boolean[] isNull, final long srcBegin, final long srcEnd); -} diff --git a/extensions/csv/src/main/java/io/deephaven/csv/tokenization/RangeTests.java b/extensions/csv/src/main/java/io/deephaven/csv/tokenization/RangeTests.java deleted file mode 100644 index 5329517cc72..00000000000 --- a/extensions/csv/src/main/java/io/deephaven/csv/tokenization/RangeTests.java +++ /dev/null @@ -1,115 +0,0 @@ -package io.deephaven.csv.tokenization; - -/** - * Simple range tests that may be faster than the corresponding Java utilities because they are ASCII-specific. - */ -public class RangeTests { - /** - * If the character is lowercase ASCII, converts it to uppercase ASCII. Otherwise leaves it alone. - * - * @param ch The character. - * @return The converted or unchanged character. - */ - public static char toUpper(char ch) { - return isLower(ch) ? (char) (ch - 'a' + 'A') : ch; - } - - /** - * Is the character uppercase ASCII? - * - * @param ch The character. - * @return True if the character is uppercase ASCII. False otherwise. - */ - public static boolean isUpper(char ch) { - return ch >= 'A' && ch <= 'Z'; - } - - /** - * Is the character lowercase ASCII? - * - * @param ch The character. - * @return True if the character is lowercase ASCII. False otherwise. - */ - public static boolean isLower(char ch) { - return ch >= 'a' && ch <= 'z'; - } - - /** - * Is the character an ASCII digit? - * - * @param ch The character. - * @return True if the character is an ASCII digit. False otherwise. - */ - public static boolean isDigit(char ch) { - return ch >= '0' && ch <= '9'; - } - - /** - * Is the character space or tab? - * - * @param ch The character. - * @return True if the character is space or tab. False otherwise. - */ - public static boolean isSpaceOrTab(byte ch) { - return ch == ' ' || ch == '\t'; - } - - /** - * Is the value in range for a Java byte? - * - * @param value The value. - * @return True if the value is in range for a Java byte. False otherwise. - */ - public static boolean isInRangeForByte(long value) { - return value >= Byte.MIN_VALUE && value <= Byte.MAX_VALUE; - } - - /** - * Is the value in range for a Java short? - * - * @param value The value. - * @return True if the value is in range for a Java short. False otherwise. - */ - public static boolean isInRangeForShort(long value) { - return value >= Short.MIN_VALUE && value <= Short.MAX_VALUE; - } - - /** - * Is the value in range for a Java int? - * - * @param value The value. - * @return True if the value is in range for a Java int. False otherwise. - */ - public static boolean isInRangeForInt(long value) { - return value >= Integer.MIN_VALUE && value <= Integer.MAX_VALUE; - } - - /** - * Is the value in range for a Java float? - * - * @param value The value. - * @return True if the value is in range for a Java float. False otherwise.
- */ - public static boolean isInRangeForFloat(double value) { - return Double.isNaN(value) || - Double.isInfinite(value) || - (value >= -Float.MAX_VALUE && value <= Float.MAX_VALUE); - } - - /** - * Are all the characters in byte slice ASCII? - * - * @param data The character data. - * @param begin The inclusive start of the slice. - * @param end The exclusive end of the slice. - * @return True if all the characters are ASCII, false otherwise. - */ - public static boolean isAscii(final byte[] data, final int begin, final int end) { - for (int cur = begin; cur != end; ++cur) { - if (data[cur] < 0) { - return false; - } - } - return true; - } -} diff --git a/extensions/csv/src/main/java/io/deephaven/csv/tokenization/Tokenizer.java b/extensions/csv/src/main/java/io/deephaven/csv/tokenization/Tokenizer.java deleted file mode 100644 index 919d2069e88..00000000000 --- a/extensions/csv/src/main/java/io/deephaven/csv/tokenization/Tokenizer.java +++ /dev/null @@ -1,690 +0,0 @@ -package io.deephaven.csv.tokenization; - -import io.deephaven.csv.containers.ByteSlice; -import org.apache.commons.lang3.mutable.*; -import ch.randelshofer.fastdoubleparser.FastDoubleParserFromByteArray; - -import java.time.*; - -/** - * This class provides a variety of methods to efficiently parse various low-level types like booleans, longs, doubles, - * and datetimes. - */ -public class Tokenizer { - /** - * An optional custom time zone parser. Used for clients (such as Deephaven itself) who support custom time zone - * formats. - */ - private final CustomTimeZoneParser customTimeZoneParser; - /** - * Storage for a temporary "out" variable owned by tryParseDateTime. - */ - private final MutableLong dateTimeTemp0 = new MutableLong(); - /** - * Storage for a temporary "out" variable owned by tryParseDateTime. - */ - private final MutableLong dateTimeTemp1 = new MutableLong(); - /** - * Storage for a temporary "out" variable owned by tryParseDateTime. - */ - private final MutableLong dateTimeTemp2 = new MutableLong(); - /** - * Storage for a temporary "out" variable owned by tryParseDateTime. - */ - private final MutableObject dateTimeTempZoneId = new MutableObject<>(); - /** - * Storage for a temporary "out" variable owned by tryParseDateTime. - */ - private final MutableBoolean dateTimeTempBoolean = new MutableBoolean(); - - public Tokenizer(CustomTimeZoneParser customTimeZoneParser) { - this.customTimeZoneParser = customTimeZoneParser; - } - - /** - * Try to parse the input as a boolean. - * - * @param bs The input text. This slice is *NOT* modified, regardless of success or failure. - * @param result Contains the parsed value if this method returns true. Otherwise, the contents are unspecified. - * @return true if the input was successfully parsed. Otherwise, false. - */ - public boolean tryParseBoolean(final ByteSlice bs, final MutableBoolean result) { - final int savedBegin = bs.begin(); - final int savedEnd = bs.end(); - Mutating.trim(bs); - // Successful if parse was successful AND input was completely consumed. - final boolean success = Mutating.tryParseBoolean(bs, result) && bs.begin() == bs.end(); - bs.setBegin(savedBegin); - bs.setEnd(savedEnd); - return success; - } - - /** - * Try to parse the input as a single character in Unicode's Basic Multilingual Plane. This means the character will - * fit in a single Java "char" without requiring UTF-16 surrogate pairs. Unicode characters that meet this criterion - * are either in the range U+0000 through U+D7FF, or the range U+E000 through U+FFFF. 
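As a standalone illustration of the masking logic in this method (the demo class name is hypothetical): the two-byte UTF-8 sequence 0xC3 0xA9 decodes to U+00E9 ('é'), which fits in a single Java char.

    // Standalone sketch of the same UTF-8 masking used by tryParseBMPChar:
    // ((0xC3 & 0x1F) << 6) | (0xA9 & 0x3F) = 0xE9 = 'é'
    public final class Utf8Demo {
        public static void main(String[] args) {
            final int first = 0xC3;  // 110xxxxx: leading byte of a 2-byte sequence
            final int second = 0xA9; // 10xxxxxx: continuation byte
            final int codePoint = ((first & 0x1F) << 6) | (second & 0x3F);
            System.out.println(Integer.toHexString(codePoint)); // prints "e9"
        }
    }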
- * - * @param bs The input text. This slice is *NOT* modified, regardless of success or failure. - * @param result Contains the parsed value if this method returns true. Otherwise, the contents are unspecified. - * @return true if the input was successfully parsed. Otherwise, false. The return value is provided in a - * {@link MutableInt} because Apache doesn't provide a MutableChar. - */ - public boolean tryParseBMPChar(final ByteSlice bs, final MutableInt result) { - final byte[] d = bs.data(); - int o = bs.begin(); - final int end = bs.end(); - if (o == end) { - return false; - } - final int first = byteToInt(d[o++]); - final int moreExpected; - int value; - if ((first & 0x80) == 0) { - // 0xxxxxxx - // 1-byte UTF-8 character aka ASCII. - // Last code point U+007F - value = first & 0x7F; - result.setValue(value); - return o == end; - } - if ((first & 0xE0) == 0xC0) { - // 110xxxxx - // 2-byte UTF-8 character - // Last code point U+07FF - value = first & 0x1F; - moreExpected = 1; - } else if ((first & 0xF0) == 0xE0) { - // 1110xxxx - // 3-byte UTF-8 character - // Last code point U+FFFF - value = first & 0x0F; - moreExpected = 2; - } else { - // 11110xxx - // 4-byte UTF-8 character - // This would take us into U+10000 territory, so we reject it. - return false; - } - - for (int ii = 0; ii < moreExpected; ++ii) { - if (o == end) { - return false; - } - final int next = byteToInt(d[o++]); - if ((next & 0xc0) != 0x80) { - // bad UTF-8 actually. - return false; - } - value = (value << 6) | (next & 0x3F); - } - - result.setValue(value); - return true; - } - - private static int byteToInt(byte b) { - return b >= 0 ? b : 256 + b; - } - - /** - * Try to parse the input as a long. - * - * @param bs The input text. This slice is *NOT* modified, regardless of success or failure. - * @param result Contains the parsed value if this method returns true. Otherwise, the contents are unspecified. - * @return true if the input was successfully parsed. Otherwise, false. - */ - public boolean tryParseLong(final ByteSlice bs, final MutableLong result) { - final int savedBegin = bs.begin(); - final int savedEnd = bs.end(); - Mutating.trim(bs); - // Successful if parse was successful AND input was completely consumed. - final boolean success = Mutating.tryParseLong(bs, result) && bs.begin() == bs.end(); - bs.setBegin(savedBegin); - bs.setEnd(savedEnd); - return success; - } - - /** - * Try to parse the input as a float, using {@link Float#parseFloat}. Most code will prefer to use - * {@link Tokenizer#tryParseDouble} because it is much faster. This method exists for callers who want the exact - * semantics of Java's {@link Float#parseFloat} and are willing to pay the performance cost (both of the toString() - * and of the slower parser). - * - * @param bs The input text. This slice is *NOT* modified, regardless of success or failure. - * @param result Contains the parsed value if this method returns true. Otherwise, the contents are unspecified. - * @return true if {@code bs} was successfully parsed as a float. Otherwise, false. 
- */ - public boolean tryParseFloatStrict(final ByteSlice bs, final MutableFloat result) { - try { - final float res = Float.parseFloat(bs.toString()); - result.setValue(res); - return true; - } catch (NumberFormatException nfe) { - // Normally we would be pretty sad about throwing exceptions in the inner loops of our CSV parsing - // framework, but the fact of the matter is that the first exception thrown will cause the - // calling parser to punt to the next parser anyway, so the overall impact is negligible. - return false; - } - } - - /** - * Try to parse the input as a double. - * - * @param bs The input text. This slice is *NOT* modified, regardless of success or failure. - * @param result Contains the parsed value if this method returns true. Otherwise, the contents are unspecified. - * @return true if {@code bs} was successfully parsed as a double. Otherwise, false. - */ - public boolean tryParseDouble(final ByteSlice bs, final MutableDouble result) { - // Our third-party double parser already checks for trailing garbage so we don't have to. - try { - final double res = FastDoubleParserFromByteArray.parseDouble(bs.data(), bs.begin(), bs.size()); - result.setValue(res); - return true; - } catch (NumberFormatException nfe) { - // Normally we would be pretty sad about throwing exceptions in the inner loops of our CSV parsing - // framework, but the fact of the matter is that the first exception thrown will cause the - // calling parser to punt to the next parser anyway, so the overall impact is negligible. - return false; - } - } - - /** - * Try to parse the input as a Deephaven DateTime value (represented as nanoseconds since the epoch). - * - * @param bs The input text. This slice is *NOT* modified, regardless of success or failure. - * @param result Contains the parsed value if this method returns true. Otherwise, the contents are unspecified. - * @return true if {@code bs} was successfully parsed as a Deephaven DateTime. Otherwise, false. - */ - public boolean tryParseDateTime(final ByteSlice bs, final MutableLong result) { - final int savedBegin = bs.begin(); - // Successful if parse was successful AND input was completely consumed. - final boolean success = Mutating.tryParseDateTime(bs, customTimeZoneParser, - dateTimeTemp0, dateTimeTemp1, dateTimeTemp2, dateTimeTempBoolean, dateTimeTempZoneId, result) && - bs.begin() == bs.end(); - bs.setBegin(savedBegin); - return success; - } - - /** - * The methods in this utility class obey the following invariants: On success, they update their incoming ByteSlice - * to point to the end of the sequence. On failure, they leave it unchanged. - */ - private static final class Mutating { - /** - * Modify the input slice to remove leading and trailing whitespace, if any. - * - * @param bs Modified in place to remove leading and trailing whitespace, if any. - */ - public static void trim(final ByteSlice bs) { - while (bs.begin() != bs.end() && RangeTests.isSpaceOrTab(bs.front())) { - bs.setBegin(bs.begin() + 1); - } - while (bs.begin() != bs.end() && RangeTests.isSpaceOrTab(bs.back())) { - bs.setEnd(bs.end() - 1); - } - } - - /** - * If the slice is nonempty and its first character is {@code ch}, then eat the first character. - * - * @param bs If the method returns true, the slice is updated to remove the first character. Otherwise the slice - * is unmodified. - * @return true If the character was eaten, false otherwise. 
- */ - private static boolean tryEatChar(final ByteSlice bs, final char ch) { - if (bs.begin() == bs.end() || bs.front() != ch) { - return false; - } - bs.setBegin(bs.begin() + 1); - return true; - } - - /** - * Parse (a prefix of) the input as a boolean. - * - * @param bs If the method returns true, the slice is updated to remove the characters comprising the result. - * Otherwise, the slice is unmodified. - * @param result Contains the parsed value if this method returns true. Otherwise, the contents are unspecified. - * @return true if the input was successfully parsed. Otherwise, false. - */ - public static boolean tryParseBoolean(final ByteSlice bs, final MutableBoolean result) { - final byte[] d = bs.data(); - final int o = bs.begin(); - final int bSize = bs.size(); - - if (bSize == 4) { - if ((d[o] == 't' || d[o] == 'T') && - (d[o + 1] == 'r' || d[o + 1] == 'R') && - (d[o + 2] == 'u' || d[o + 2] == 'U') && - (d[o + 3] == 'e' || d[o + 3] == 'E')) { - result.setValue(true); - bs.setBegin(bs.end()); - return true; - } - return false; - } - - if (bSize == 5) { - if ((d[o] == 'f' || d[o] == 'F') && - (d[o + 1] == 'a' || d[o + 1] == 'A') && - (d[o + 2] == 'l' || d[o + 2] == 'L') && - (d[o + 3] == 's' || d[o + 3] == 'S') && - (d[o + 4] == 'e' || d[o + 4] == 'E')) { - result.setValue(false); - bs.setBegin(bs.end()); - return true; - } - return false; - } - - return false; - } - - /** - * Parse (a prefix of) the input as a long. - * - * @param bs If the method returns true, the slice is updated to remove the characters comprising the result. - * Otherwise, the slice is unmodified. - * @param result Contains the parsed value if this method returns true. Otherwise, the contents are unspecified. - * @return true if the input was successfully parsed. Otherwise, false. - */ - public static boolean tryParseLong(final ByteSlice bs, final MutableLong result) { - final int savedBegin = bs.begin(); - if (bs.begin() == bs.end()) { - return false; - } - final char front = (char) bs.front(); - boolean negative = false; - if (front == '+') { - bs.setBegin(bs.begin() + 1); - } else if (front == '-') { - negative = true; - bs.setBegin(bs.begin() + 1); - } - if (!tryParseWholeNumber(bs, 1, 999, negative, result)) { - bs.setBegin(savedBegin); - return false; - } - return true; - } - - /** - * Parse (a prefix of) the input as a DateTime. Formats are largely ISO except we allow a pluggable timezone - * parser, used for example to support Deephaven-style time zones. - *

- * Allowable formats:
- *   2021-11-07T09:00:00Z
- *   2021-11-07T09:00:00.1Z
- *   2021-11-07T09:00:00.12Z
- *   ...
- *   2021-11-07T09:00:00.123456789Z
- *
    - * Hyphens and colons are optional (all in or all out). The 'T' can also be a space. The Z above is either the - * literal Z meaning UTC or some other text. If this character is not Z, the method will call out to a pluggable - * time zone parser to see if the text can be parsed as a time zone. In Deephaven this is used to parse - * Deephaven time zones like " NY", " MN", " ET", " UTC" etc. - * - *

- * Allowable formats in UTC offset style (can be + or -):
- * The offset can be hh or hh:mm or hhmm.
- *   2021-11-07T09:00:00+01
- *   2021-11-07T09:00:00.1-02:30
- *   2021-11-07T09:00:00.12+0300
- *   ...
- *   2021-11-07T09:00:00.123456789+01:30
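For illustration, here is a hypothetical caller exercising these formats against the classes in this extension ({@code DateTimeDemo} is an invented name; passing null means no custom time zone parser, so only the literal Z and UTC-offset suffixes are recognized):

    import io.deephaven.csv.containers.ByteSlice;
    import io.deephaven.csv.tokenization.Tokenizer;
    import org.apache.commons.lang3.mutable.MutableLong;
    import java.nio.charset.StandardCharsets;

    // Parses an ISO timestamp with a fractional second and a UTC offset.
    public final class DateTimeDemo {
        public static void main(String[] args) {
            final Tokenizer tokenizer = new Tokenizer(null);
            final byte[] text = "2021-11-07T09:00:00.5-02:30".getBytes(StandardCharsets.UTF_8);
            final ByteSlice slice = new ByteSlice(text, 0, text.length);
            final MutableLong nanos = new MutableLong();
            if (tokenizer.tryParseDateTime(slice, nanos)) {
                System.out.println(nanos.longValue()); // nanoseconds since the epoch, in UTC
            }
        }
    }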
    - * - * @param bs The input text. If the method returns true, the slice will be advanced past the parsed text. - * Otherwise (if the method returns false), the slice will be unchanged. - * @param temp0 A MutableLong for the method to use for temporary storage, so it doesn't have to allocate one. - * @param temp1 A MutableLong for the method to use for temporary storage, so it doesn't have to allocate one. - * @param temp2 A MutableLong for the method to use for temporary storage, so it doesn't have to allocate one. - * @param tempZoneId A MutableObject<ZoneId> for the method to use for temporary storage, so it doesn't - * have to allocate one. - * @param result The DateTime (in nanoseconds since the epoch) if the method returns true. Otherwise, the - * contents are unspecified. - * @return true if the input was successfully parsed. Otherwise, false. - */ - private static boolean tryParseDateTime(final ByteSlice bs, final CustomTimeZoneParser customTimeZoneParser, - final MutableLong temp0, final MutableLong temp1, final MutableLong temp2, - final MutableBoolean tempBoolean, final MutableObject tempZoneId, - final MutableLong result) { - // The logic proceeds as follows. - // First we have the required fields: - // yyyy - // - (optional, but if absent then no later hyphens or colons) - // mm - // - (optional, but presence or absence of punctuation needs to be consistent. Also we can stop here). - // dd - // T or space (or we stop here) - // hh - // : (optional, but presence or absence of punctuation needs to be consistent. Also we can stop here). - // MM - // : (optional, but presence or absence of punctuation needs to be consistent. Also we can stop here). - // SS - // . or , (optional, introduces fraction, must be followed by 1-9 decimal digits). - // Z or + or -: - // Z means UTC - // + or - means an offset follows, which itself is - // hh - // : (optional) - // mm (optional) - // Otherwise we call out to the pluggable time zone parser to see if it can parse a timezone out of the - // remaining text. - final int savedBegin = bs.begin(); - if (!tryParseYyyymmdd(bs, temp0, temp1, temp2, tempBoolean)) { - return false; - } - final int year = temp0.intValue(); - final int month = temp1.intValue(); - final int day = temp2.intValue(); - final boolean punctuationRequired = tempBoolean.booleanValue(); - - // Require 'T' or ' ' (per RFC 3339). - if (!tryEatChar(bs, 'T') && !tryEatChar(bs, ' ')) { - bs.setBegin(savedBegin); - return false; - } - - // Reusing result for temporary storage! - if (!tryParseHHmmssNanos(bs, punctuationRequired, temp0, temp1, temp2, result)) { - bs.setBegin(savedBegin); - return false; - } - final int hour = temp0.intValue(); - final int minute = temp1.intValue(); - final int second = temp2.intValue(); - final int nanos = result.intValue(); - - if (!tryParseIsoTimeZone(bs, tempZoneId, temp0) && - (customTimeZoneParser == null || !customTimeZoneParser.tryParse(bs, tempZoneId, temp0))) { - bs.setBegin(savedBegin); - return false; - } - final ZoneId zoneIdToUse = tempZoneId.getValue(); - final long secondsOffsetToUse = temp0.getValue(); - - final ZonedDateTime zdt = ZonedDateTime.of(year, month, day, hour, minute, second, 0, zoneIdToUse); - final long zdtSeconds = zdt.toEpochSecond(); - final long adjustedZdtSeconds = zdtSeconds + secondsOffsetToUse; - final long adjustedZdtNanos = adjustedZdtSeconds * 1_000_000_000L + nanos; - result.setValue(adjustedZdtNanos); - return true; - } - - /** - * Parse (a prefix of) the input as yyyyMMdd or yyyy-MM-dd. 
- * - * @param bs The input text. If the method returns true, the slice will be advanced past the parsed text. - * Otherwise (if the method returns false), the slice will be unchanged. - * @param yyyy Contains the parsed year if this method returns true. Otherwise, the contents are unspecified. - * @param mm Contains the parsed month if this method returns true. Otherwise, the contents are unspecified. - * @param dd Contains the parsed day if this method returns true. Otherwise, the contents are unspecified. - * @param hasPunctuation Contains whether hyphens were found in the input if this method returns true. - * Otherwise, the contents are unspecified. - * @return true if the input was successfully parsed. Otherwise, false. - */ - private static boolean tryParseYyyymmdd(final ByteSlice bs, final MutableLong yyyy, - final MutableLong mm, final MutableLong dd, final MutableBoolean hasPunctuation) { - final int savedBegin = bs.begin(); - if (!tryParseWholeNumber(bs, 4, 4, false, yyyy)) { - return false; - } - - hasPunctuation.setValue(Mutating.tryEatChar(bs, '-')); - - if (!tryParseWholeNumber(bs, 2, 2, false, mm)) { - bs.setBegin(savedBegin); - return false; - } - - if (hasPunctuation.booleanValue() && !tryEatChar(bs, '-')) { - bs.setBegin(savedBegin); - return false; - } - if (!tryParseWholeNumber(bs, 2, 2, false, dd)) { - bs.setBegin(savedBegin); - return false; - } - return true; - } - - /** - * Parse (a prefix of) the input as hhmmss.nnnnnn or hh:mm:ss.nnnn and various variants (minutes, seconds, and - * nanos are optional, and the nanos separator is either period or comma). - * - * @param bs The input text. If the method returns true, the slice will be advanced past the parsed text. - * Otherwise (if the method returns false), the slice will be unchanged. - * @param punctuationRequired Indicates whether punctuation (namely colons) is required between the fields. - * @param hours Contains the parsed hours if this method returns true. Otherwise, the contents are unspecified. - * @param minutes Contains the parsed minutes if this method returns true. Otherwise, the contents are - * unspecified. - * @param seconds Contains the parsed seconds if this method returns true. Otherwise, the contents are - * unspecified. - * @param nanos Contains the parsed nanos if this method returns true. Otherwise, the contents are unspecified. - * @return true if the input was successfully parsed. Otherwise, false. - */ - private static boolean tryParseHHmmssNanos(final ByteSlice bs, final boolean punctuationRequired, - final MutableLong hours, - final MutableLong minutes, - final MutableLong seconds, final MutableLong nanos) { - final int savedBegin = bs.begin(); - // Hour - if (!tryParseWholeNumber(bs, 2, 2, false, hours)) { - return false; - } - // Set defaults for minutes, seconds, nanos, in case we exit early. - minutes.setValue(0); - seconds.setValue(0); - nanos.setValue(0); - - // Minutes, seconds, and nanos are optional. - - // If a colon is required but not present, then the parse is done (this is not an error). - if (punctuationRequired && !tryEatChar(bs, ':')) { - return true; - } - - // Try minutes - if (!tryParseWholeNumber(bs, 2, 2, false, minutes)) { - // Next thing is not a number. If we previously ingested a colon, not having a next number is an error. - // But if we did not ingest a colon, not having a number is ok. - // If we return false we are obligated to reset the slice. - minutes.setValue(0); // Sub-parse failed, but we still might return success. So this needs to be - // correct. 
- final boolean success = !punctuationRequired; - if (!success) { - bs.setBegin(savedBegin); - } - return success; - } - - // If a colon is required but not present, then the parse is done (this is not an error). - if (punctuationRequired && !tryEatChar(bs, ':')) { - return true; - } - - // Try seconds. - if (!tryParseWholeNumber(bs, 2, 2, false, seconds)) { - // Next thing is not a number. If we previously ingested a colon, not having a next number is an error. - // But if we did not ingest a colon, not having a number is ok. - // If we return false we are obligated to reset the slice. - seconds.setValue(0); // Sub-parse failed, but we still might return success. So this needs to be - // correct. - final boolean success = !punctuationRequired; - if (!success) { - bs.setBegin(savedBegin); - } - return success; - } - - if (!tryEatChar(bs, '.') && !tryEatChar(bs, ',')) { - // Period (or comma!) introduces fraction. If not present, then stop the parse here (with a success - // indication) - return true; - } - - // Try nanoseconds - final int beginBeforeNs = bs.begin(); - if (!tryParseWholeNumber(bs, 1, 9, false, nanos)) { - // If you couldn't get a number, that's a parse fail. - bs.setBegin(savedBegin); - return false; - } - - // Pad to the right with zeroes (that is, in "blah.12", the .12 is 120,000,000 nanos). - final int length = bs.begin() - beginBeforeNs; - for (int ii = length; ii < 9; ++ii) { - nanos.setValue(10 * nanos.getValue()); - } - return true; - } - - /** - * Try to parse (a prefix of) the input as a whole number. - * - * @param bs The input text. If the method returns true, the slice will be advanced past the parsed text. - * Otherwise (if the method returns false), the slice will be unchanged. - * @param minSize The parsed number must be at least this many digits. Otherwise, we will return false. - * @param maxSize The parsed number must be at most this many digits. We will stop the parse after this size, - * even if the parse could continue (e.g. even if a digit immediately follows). - * @param negate If we should negate the parsed number on the way out. - * @param result Contains the parsed whole number if this method returns true. Otherwise, the contents are - * unspecified. - * @return true if the input was successfully parsed. Otherwise, false. - */ - private static boolean tryParseWholeNumber(final ByteSlice bs, final int minSize, final int maxSize, - final boolean negate, final MutableLong result) { - final byte[] data = bs.data(); - final int begin = bs.begin(); - final int end = bs.end(); - final int size = bs.size(); - if (size < minSize) { - return false; - } - final int endToUse = Math.min(end, begin + maxSize); - long res = 0; - long prevRes = 0; - int current = begin; - // We build the number using negative values, because the negative range is slightly longer and this helps - // us when we happen to parse Long.MIN_VALUE. - for (; current < endToUse; ++current) { - final char ch = (char) data[current]; - if (!RangeTests.isDigit(ch)) { - break; - } - res = res * 10 - (ch - '0'); - if (res > prevRes) { - // Overflow. - return false; - } - prevRes = res; - } - if (current == begin) { - return false; - } - if (!negate) { - // Caller wanted a positive number, but we operate in a negative number system. - if (res == Long.MIN_VALUE) { - // Can't represent the negation of Long.MIN_VALUE. - return false; - } - res = -res; - } - result.setValue(res); - bs.setBegin(current); - return true; - } - - /** - * Try to parse (a prefix of) the input as an ISO time zone.
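Before the time zone handling, a brief standalone sketch of why tryParseWholeNumber above accumulates in the negative range: the magnitude of Long.MIN_VALUE exceeds Long.MAX_VALUE, so accumulating positively would overflow on exactly that input. The demo class name is invented, and the overflow check of the real method is omitted:

    // Builds |Long.MIN_VALUE| digit by digit without overflowing.
    public final class NegativeAccumulationDemo {
        public static void main(String[] args) {
            final String digits = "9223372036854775808"; // |Long.MIN_VALUE|
            long res = 0;
            for (int i = 0; i < digits.length(); ++i) {
                res = res * 10 - (digits.charAt(i) - '0'); // stay in the negative range
            }
            System.out.println(res == Long.MIN_VALUE); // true; negating res would overflow
        }
    }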
For convenience/efficiency, the method is allowed - * to return either a ZoneOffset or a numerical offset in seconds (or both). - * - * @param bs The input text. If the method returns true, the slice will be advanced past the parsed text. - * Otherwise (if the method returns false), the slice will be unchanged. - * @param zoneId Contains the parsed time zone if this method returns true. Otherwise, the contents are - * unspecified. - * @param offsetSeconds Contains a time zone offset in seconds if this method returns true. Otherwise, the - * contents are unspecified. - * @return true if the input was successfully parsed. Otherwise, false. - */ - private static boolean tryParseIsoTimeZone(final ByteSlice bs, final MutableObject zoneId, - final MutableLong offsetSeconds) { - if (bs.size() == 0) { - return false; - } - - final char front = (char) bs.front(); - if (front == 'Z') { - zoneId.setValue(ZoneOffset.UTC); - offsetSeconds.setValue(0); - bs.setBegin(bs.begin() + 1); - return true; - } - - // Try an offset like +02 or +03:30 or -0400 - if (front != '+' && front != '-') { - return false; - } - final boolean negative = front == '-'; - - final int savedBegin = bs.begin(); - bs.setBegin(bs.begin() + 1); - - // Reuse offsetSeconds as temp variable - if (!tryParseWholeNumber(bs, 2, 2, false, offsetSeconds)) { - bs.setBegin(savedBegin); - return false; - } - final long hours = offsetSeconds.longValue(); - - // Optional colon - tryEatChar(bs, ':'); - - long minutes = 0; - if (bs.size() != 0) { - // Reuse offsetSeconds as temp variable - if (!tryParseWholeNumber(bs, 2, 2, false, offsetSeconds)) { - bs.setBegin(savedBegin); - return false; - } - minutes = offsetSeconds.longValue(); - } - zoneId.setValue(ZoneOffset.UTC); - - // If someone says yyyy-MM-DDThh:mm:ss-05 - // The "-05" means this is meant to be interpreted as UTC-5. - // If I parse yyyy-MM-DDThh:mm:ss in UTC (without any offset), it will be 5 hours later than - // what the user intended. So in other words, I need to negate the -05. - // Put more simply, when it is 1pm in the zone UTC-4, it is 5pm in the zone UTC. - // So to convert 1pm UTC-4 to 5pm UTC you need to *add* 4. - final long offset = ((hours * 60) + minutes) * 60; - offsetSeconds.setValue(negative ? offset : -offset); - return true; - } - } - - /** - * A pluggable interface for a user-supplied time zone parser. - */ - public interface CustomTimeZoneParser { - /** - * Try to parse a user-defined time zone. - * - * @param bs The input text. If the method returns true, the slice will be advanced past the parsed text. - * Otherwise (if the method returns false), the slice will be unchanged. - * @param zoneId Contains the parsed time zone if this method returns true. Otherwise, the contents are - * unspecified. - * @param offsetSeconds Contains a time zone offset in seconds if this method returns true. Otherwise, the - * contents are unspecified. - * @return true if the input was successfully parsed. Otherwise, false. - */ - boolean tryParse(final ByteSlice bs, final MutableObject zoneId, final MutableLong offsetSeconds); - } -} diff --git a/extensions/csv/src/main/java/io/deephaven/csv/util/CsvReaderException.java b/extensions/csv/src/main/java/io/deephaven/csv/util/CsvReaderException.java deleted file mode 100644 index 08a1bafd0e8..00000000000 --- a/extensions/csv/src/main/java/io/deephaven/csv/util/CsvReaderException.java +++ /dev/null @@ -1,14 +0,0 @@ -package io.deephaven.csv.util; - -/** - * The standard Exception class for various CSV errors. 
- */
-public class CsvReaderException extends Exception {
-    public CsvReaderException(String message) {
-        super(message);
-    }
-
-    public CsvReaderException(String message, Throwable cause) {
-        super(message, cause);
-    }
-}
diff --git a/extensions/csv/src/main/java/io/deephaven/csv/util/Renderer.java b/extensions/csv/src/main/java/io/deephaven/csv/util/Renderer.java
deleted file mode 100644
index 4efdb3b6afa..00000000000
--- a/extensions/csv/src/main/java/io/deephaven/csv/util/Renderer.java
+++ /dev/null
@@ -1,65 +0,0 @@
-package io.deephaven.csv.util;
-
-import java.util.function.Function;
-
-
-/**
- * Utility class for rendering Iterables as a string. The methods can intersperse a comma (or other separator), and can
- * take a custom function to render each item as a string.
- */
-public class Renderer {
-    /**
-     * Render the items in {@code items} using the separator ", " and the renderer {@link Object#toString}.
-     *
-     * @param items The items.
-     * @return The items rendered as a {@link String}, separated by {@code ", "}.
-     */
-    public static <T> String renderList(Iterable<T> items) {
-        return renderList(items, ", ", Object::toString);
-    }
-
-    /**
-     * Render the items in {@code items} using a custom separator and the renderer {@link Object#toString}.
-     *
-     * @param items The items.
-     * @param separator The separator.
-     * @return The items rendered as a {@link String}, separated by {@code separator}.
-     */
-    public static <T> String renderList(Iterable<T> items, String separator) {
-        return renderList(items, separator, Object::toString);
-    }
-
-    /**
-     * Render the items in {@code items} using a custom separator and a custom renderer.
-     *
-     * @param items The items.
-     * @param separator The separator.
-     * @param renderer The renderer.
-     * @return The items rendered as a {@link String}, separated by {@code separator}.
-     */
-    public static <T> String renderList(Iterable<T> items, final String separator, Function<T, String> renderer) {
-        return renderList(new StringBuilder(), items, separator, renderer).toString();
-    }
-
-    /**
-     * Render the items in {@code items} to the {@link StringBuilder} sb, using the separator {@code separator} and the
-     * custom rendering function {@code renderer}.
-     *
-     * @param sb The destination where the text is written to.
-     * @param items The items to render.
-     * @param separator The separator to use.
-     * @param renderer A function that renders an individual item as a string.
-     * @param <T> The element type of {@code items}.
-     * @return The passed-in {@link StringBuilder} sb.
- */
-    public static <T> StringBuilder renderList(StringBuilder sb, Iterable<T> items, final String separator,
-            Function<T, String> renderer) {
-        String separatorToUse = "";
-        for (T item : items) {
-            sb.append(separatorToUse);
-            sb.append(renderer.apply(item));
-            separatorToUse = separator;
-        }
-        return sb;
-    }
-}
diff --git a/extensions/csv/src/test/java/io/deephaven/csv/CsvReaderTest.java b/extensions/csv/src/test/java/io/deephaven/csv/CsvReaderTest.java
deleted file mode 100644
index ceef087a3e7..00000000000
--- a/extensions/csv/src/test/java/io/deephaven/csv/CsvReaderTest.java
+++ /dev/null
@@ -1,2010 +0,0 @@
-package io.deephaven.csv;
-
-import gnu.trove.list.array.*;
-import io.deephaven.csv.containers.ByteSlice;
-import io.deephaven.csv.parsers.IteratorHolder;
-import io.deephaven.csv.parsers.Parser;
-import io.deephaven.csv.parsers.Parsers;
-import io.deephaven.csv.reading.CsvReader;
-import io.deephaven.csv.sinks.Sink;
-import io.deephaven.csv.sinks.SinkFactory;
-import io.deephaven.csv.tokenization.RangeTests;
-import io.deephaven.csv.tokenization.Tokenizer;
-import io.deephaven.csv.util.CsvReaderException;
-import io.deephaven.csv.util.Renderer;
-import org.apache.commons.io.input.ReaderInputStream;
-import org.assertj.core.api.Assertions;
-import org.jetbrains.annotations.NotNull;
-import org.junit.Test;
-
-import java.io.*;
-import java.lang.reflect.Array;
-import java.math.BigDecimal;
-import java.nio.charset.StandardCharsets;
-import java.time.Instant;
-import java.time.ZoneOffset;
-import java.util.ArrayList;
-import java.util.List;
-import java.util.function.*;
-
-public class CsvReaderTest {
-    private static class Sentinels {
-        public static final byte NULL_BOOLEAN_AS_BYTE = Byte.MIN_VALUE;
-        public static final byte NULL_BYTE = Byte.MIN_VALUE;
-        public static final short NULL_SHORT = Short.MIN_VALUE;
-        public static final int NULL_INT = Integer.MIN_VALUE;
-        public static final long NULL_LONG = Long.MIN_VALUE;
-        public static final float NULL_FLOAT = -Float.MAX_VALUE;
-        public static final double NULL_DOUBLE = -Double.MAX_VALUE;
-        public static final char NULL_CHAR = Character.MAX_VALUE;
-        public static final long NULL_DATETIME_AS_LONG = Long.MIN_VALUE;
-        public static final long NULL_TIMESTAMP_AS_LONG = Long.MIN_VALUE;
-    }
-
-    @Test
-    public void countsAreCorrect() throws CsvReaderException {
-        final String input = "" +
-                "Values\n" +
-                "1\n" +
-                "\n" +
-                "3\n";
-        final CsvReader.Result result = parse(defaultCsvReader(), toInputStream(input));
-        Assertions.assertThat(result.numCols()).isEqualTo(1);
-        Assertions.assertThat(result.numRows()).isEqualTo(3);
-    }
-
-    @Test
-    public void countsAreCorrectNoTrailingNewline() throws CsvReaderException {
-        final String input = "" +
-                "Values\n" +
-                "1\n" +
-                "\n" +
-                "3";
-        final CsvReader.Result result = parse(defaultCsvReader(), toInputStream(input));
-        Assertions.assertThat(result.numCols()).isEqualTo(1);
-        Assertions.assertThat(result.numRows()).isEqualTo(3);
-    }
-
-    @Test
-    public void countsAreCorrectHeaderless() throws CsvReaderException {
-        final String input = "" +
-                "1\n" +
-                "\n" +
-                "3\n";
-        final CsvReader.Result result =
-                parse(defaultCsvReader().setHasHeaders(false).setHeaders("Value"), toInputStream(input));
-        Assertions.assertThat(result.numCols()).isEqualTo(1);
-        Assertions.assertThat(result.numRows()).isEqualTo(3);
-    }
-
-    @Test
-    public void multilineColumnName() throws CsvReaderException {
-        final String input = "" +
-                "|Some\nInts|,|Some\rStrings|,|Some\r\nBools|,|Some\r\n\nDoubles|\n" +
-                "-3,foo,false,1.0\n" +
"4,bar,true,2.0\n" + - "-5,baz,false,3.0\n"; - final CsvReader.Result result = parse(defaultCsvReader().setquoteChar('|'), toInputStream(input)); - final ColumnSet cs = toColumnSet(result); - Assertions.assertThat(cs.columns[0].name).isEqualTo("Some\nInts"); - Assertions.assertThat(cs.columns[1].name).isEqualTo("Some\rStrings"); - Assertions.assertThat(cs.columns[2].name).isEqualTo("Some\r\nBools"); - Assertions.assertThat(cs.columns[3].name).isEqualTo("Some\r\n\nDoubles"); - } - - @Test - public void multilineColumnNameReportsCorrectRowNumber() { - // Too many columns is an error. - final String input = "" + - "|Some\nInts|,|Some\rStrings|,|Some\r\nBools|,|Some\r\n\nDoubles|\n" + - "-3,foo,false,1.0\n" + - "4,bar,true,2.0,quz\n" + - "-5,baz,false,3.0\n"; - Assertions.assertThatThrownBy(() -> parse(defaultCsvReader().setquoteChar('|'), toInputStream(input))) - .hasRootCauseMessage("Row 8 has too many columns (expected 4)"); - } - - - private static final String BOOLEAN_INPUT = "" + - "Values\n" + - "true\n" + - "\n" + - "false\n" + - "True\n" + - "False\n" + - "TrUe\n" + - "FALSE\n"; - - @Test - public void booleans() throws CsvReaderException { - final ColumnSet expected = ColumnSet.of( - Column.ofValues("Values", (byte) 1, Sentinels.NULL_BOOLEAN_AS_BYTE, (byte) 0, (byte) 1, (byte) 0, - (byte) 1, (byte) 0).reinterpret(boolean.class)); - - invokeTest(defaultCsvReader(), BOOLEAN_INPUT, expected); - } - - private static final String CHAR_INPUT = "" + - "Values\n" + - "A\n" + - "\n" + - "B\n" + - "C\n" + - "1\n" + - "2\n" + - "3\n"; - - @Test - public void chars() throws CsvReaderException { - final ColumnSet expected = ColumnSet.of( - Column.ofValues("Values", 'A', Sentinels.NULL_CHAR, 'B', 'C', '1', '2', '3')); - - invokeTest(defaultCsvReader(), CHAR_INPUT, expected); - } - - @Test - public void forbiddenNullChars() throws CsvReaderException { - final String input = "" + - "Values\n" + - "A\n" + - Sentinels.NULL_CHAR + "\n"; - - // NULL_CHAR can't be parsed as char; will be promoted to String. - final ColumnSet expected = ColumnSet.of( - Column.ofRefs("Values", "A", "" + Sentinels.NULL_CHAR)); - - invokeTest(defaultCsvReader(), input, expected); - } - - private static final String BYTE_INPUT = "" + - "Values\n" + - "-127\n" + - "\n" + - "127\n"; - - @Test - public void byteViaInference() throws CsvReaderException { - final ColumnSet expected = ColumnSet.of( - Column.ofValues("Values", (byte) (Byte.MIN_VALUE + 1), Sentinels.NULL_BYTE, Byte.MAX_VALUE)); - - invokeTest(defaultCsvReader().setParsers(Parsers.COMPLETE), BYTE_INPUT, expected); - } - - @Test - public void forbiddenNullBytes() throws CsvReaderException { - final String input = "" + - "Values\n" + - "-127\n" + - Sentinels.NULL_BYTE + "\n" + - "127\n"; - // NULL_BYTE can't be parsed as char; will be promoted to short (because we're using - // the Parsers.COMPLETE set of parsers, and short is in Parsers.COMPLETE set). 
- final ColumnSet expected = ColumnSet.of( - Column.ofValues("Values", (short) (Byte.MIN_VALUE + 1), Sentinels.NULL_BYTE, Byte.MAX_VALUE)); - - invokeTest(defaultCsvReader().setParsers(Parsers.COMPLETE), input, expected); - } - - @Test - public void byteIsInt() throws CsvReaderException { - // By default, byte will be parsed as int, because neither Parsers.BYTE nor Parsers.SHORT is in Parsers.DEFAULT - final ColumnSet expected = ColumnSet.of( - Column.ofValues("Values", (Byte.MIN_VALUE + 1), Sentinels.NULL_INT, Byte.MAX_VALUE)); - - invokeTest(defaultCsvReader(), BYTE_INPUT, expected); - } - - private static final String SHORT_INPUT = "" + - "Values\n" + - "-32767\n" + - "\n" + - "32767\n"; - - @Test - public void shorts() throws CsvReaderException { - final ColumnSet expected = ColumnSet.of( - Column.ofValues("Values", (short) (Short.MIN_VALUE + 1), Sentinels.NULL_SHORT, Short.MAX_VALUE)); - - invokeTest(defaultCsvReader().setParsers(Parsers.COMPLETE), SHORT_INPUT, expected); - } - - @Test - public void forbiddenNullShorts() throws CsvReaderException { - final String input = "" + - "Values\n" + - "-32767\n" + - Sentinels.NULL_SHORT + "\n" + - "32767\n"; - - // NULL_SHORT can't be parsed as short; will be promoted to int. - final ColumnSet expected = ColumnSet.of( - Column.ofValues("Values", (int) (Short.MIN_VALUE + 1), Sentinels.NULL_SHORT, Short.MAX_VALUE)); - - invokeTest(defaultCsvReader().setParsers(Parsers.COMPLETE), input, expected); - } - - @Test - public void ints() throws CsvReaderException { - final String input = "" + - "Values\n" + - "-2147483647\n" + - "\n" + - "2147483647\n"; - - final ColumnSet expected = ColumnSet.of( - Column.ofValues("Values", Integer.MIN_VALUE + 1, Sentinels.NULL_INT, Integer.MAX_VALUE)); - - invokeTest(defaultCsvReader(), input, expected); - } - - @Test - public void forbiddenNullInts() throws CsvReaderException { - final String input = "" + - "Values\n" + - Sentinels.NULL_INT + "\n"; - - // NULL_INT can't be parsed as int; will be promoted to long. - final ColumnSet expected = ColumnSet.of( - Column.ofValues("Values", (long) Sentinels.NULL_INT)); - - invokeTest(defaultCsvReader(), input, expected); - } - - private static final String LONG_INPUT = "" + - "Values\n" + - "-9223372036854775807\n" + - "\n" + - "9223372036854775807\n"; - - @Test - public void longs() throws CsvReaderException { - final ColumnSet expected = ColumnSet.of( - Column.ofValues("Values", Long.MIN_VALUE + 1, Sentinels.NULL_LONG, Long.MAX_VALUE)); - - invokeTest(defaultCsvReader(), LONG_INPUT, expected); - } - - @Test - public void forbiddenNullLongs() throws CsvReaderException { - final String input = "" + - "Values\n" + - Sentinels.NULL_LONG + "\n"; - - // NULL_LONG can't be parsed as long; will be promoted to double. 
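//        A minimal illustrative sketch of this promotion: "-9223372036854775808" is
//        Long.MIN_VALUE, the NULL_LONG sentinel, so the column falls through to double.
//        The value happens to survive exactly, because -(2^63) is a power of two and is
//        therefore representable in an IEEE-754 double:
//
//        final double promoted = Double.parseDouble("-9223372036854775808");
//        assert promoted == (double) Long.MIN_VALUE; // exact: -(2^63)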
- final ColumnSet expected = ColumnSet.of( - Column.ofValues("Values", (double) Sentinels.NULL_LONG)); - - invokeTest(defaultCsvReader(), input, expected); - } - - @Test - public void longAsStringsViaInference() throws CsvReaderException { - final ColumnSet expected = ColumnSet.of( - Column.ofRefs("Values", "-9223372036854775807", null, "9223372036854775807")); - - invokeTest(defaultCsvReader().setParsers(List.of(Parsers.STRING)), LONG_INPUT, expected); - } - - @Test - public void longAsStringsViaParser() throws CsvReaderException { - final ColumnSet expected = ColumnSet.of( - Column.ofRefs("Values", "-9223372036854775807", null, "9223372036854775807")); - - invokeTest(defaultCsvReader().setParserFor("Values", Parsers.STRING), LONG_INPUT, expected); - } - - private static final String FLOAT_INPUT = "" + - "Values\n" + - "Infinity\n" + - "\n" + - "-Infinity\n" + - "NaN\n" + - "3.4028234e+38\n" + - "1.17549435E-38\n" + - "1.4e-45\n"; - - @Test - public void floatIsDouble() throws CsvReaderException { - final ColumnSet expected = ColumnSet.of( - Column.ofValues("Values", - Float.POSITIVE_INFINITY, - Sentinels.NULL_DOUBLE, - Float.NEGATIVE_INFINITY, - Float.NaN, - 3.4028234e+38d, - 1.17549435E-38d, - 1.4e-45d)); - - invokeTest(defaultCsvReader(), FLOAT_INPUT, expected); - } - - @Test - public void floatViaInference() throws CsvReaderException { - final ColumnSet expected = ColumnSet.of( - Column.ofValues("Values", - Float.POSITIVE_INFINITY, - Sentinels.NULL_FLOAT, - Float.NEGATIVE_INFINITY, - Float.NaN, - Float.MAX_VALUE, - Float.MIN_NORMAL, - Float.MIN_VALUE)); - - invokeTest(defaultCsvReader().setParsers(List.of(Parsers.FLOAT_FAST)), FLOAT_INPUT, expected); - } - - @Test - public void forbiddenNullFloats() throws CsvReaderException { - final String input = "" + - "Values\n" + - Sentinels.NULL_FLOAT + "\n"; - - // I wanted to say simply (double)Sentinels.NULL_FLOAT, but that's a different number from - // the below (alas). - final double nullFloatAsParsedByDouble = Double.parseDouble("" + Sentinels.NULL_FLOAT); - - // NULL_FLOAT can't be parsed as float; will be promoted to double. 
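//        A minimal illustrative sketch of the "different number (alas)" remark above:
//        widening -Float.MAX_VALUE to double preserves the exact binary value, while
//        parsing its decimal rendering yields the double nearest that decimal string,
//        and the two differ:
//
//        final float f = -Float.MAX_VALUE;                 // prints as -3.4028235E38
//        final double widened = f;                         // exactly -3.4028234663852886E38
//        final double parsed = Double.parseDouble("" + f); // nearest double to -3.4028235E38
//        assert widened != parsed;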
        final ColumnSet expected = ColumnSet.of(
-                Column.ofValues("Values", nullFloatAsParsedByDouble));
-
-        invokeTest(defaultCsvReader().setParsers(Parsers.COMPLETE), input, expected);
-    }
-
-    @Test
-    public void doubleRange() throws CsvReaderException {
-        final String input = "" +
-                "Values\n" +
-                "Infinity\n" +
-                "\n" +
-                "-Infinity\n" +
-                "NaN\n" +
-                "1.7976931348623157e+308\n" +
-                "2.2250738585072014E-308\n" +
-                "4.9e-324\n";
-
-        final ColumnSet expected = ColumnSet.of(
-                Column.ofValues("Values",
-                        Double.POSITIVE_INFINITY,
-                        Sentinels.NULL_DOUBLE,
-                        Double.NEGATIVE_INFINITY,
-                        Double.NaN,
-                        Double.MAX_VALUE,
-                        Double.MIN_NORMAL,
-                        Double.MIN_VALUE));
-
-        invokeTest(defaultCsvReader(), input, expected);
-    }
-
-    @Test
-    public void forbiddenNullDoubles() throws CsvReaderException {
-        final String input = "" +
-                "Values\n" +
-                Sentinels.NULL_DOUBLE + "\n";
-
-        // NULL_DOUBLE can't be parsed as double; will be promoted to String
-        final ColumnSet expected = ColumnSet.of(
-                Column.ofRefs("Values", Sentinels.NULL_DOUBLE + ""));
-
-        invokeTest(defaultCsvReader().setParsers(Parsers.COMPLETE), input, expected);
-    }
-
-    @Test
-    public void varietyOfNumerics() throws CsvReaderException {
-        final String input = "" +
-                "Values\n" +
-                "\n" + // NULL
-                "\n" + // NULL
-                "0\n" + // byte
-                "1\n" + // byte
-                "300\n" + // short
-                "400\n"; // short
-        // "100000\n" + // int
-        // "100001\n" + // int
-        // "3000000000\n" + // long
-        // "123.456\n" + // float
-        // "1234.5678\n"; // double
-
-        // Every value fits in a short (and the all-null leading cells default to NULL_SHORT),
-        // so the column is inferred as short.
-        final ColumnSet expected = ColumnSet.of(
-                Column.ofValues("Values", Sentinels.NULL_SHORT, Sentinels.NULL_SHORT, (short) 0, (short) 1, (short) 300,
-                        (short) 400));
-
-        invokeTest(defaultCsvReader().setParsers(Parsers.COMPLETE), input, expected);
-    }
-
-
-    @Test
-    public void strings() throws CsvReaderException {
-        final String input = "" +
-                "Values\n" +
-                "\"Hello, world\"\n" +
-                "\n" + // the empty string is null
-                "Goodbye.\n";
-
-        final ColumnSet expected = ColumnSet.of(
-                Column.ofRefs("Values",
-                        "Hello, world",
-                        null,
-                        "Goodbye."));
-
-        invokeTest(defaultCsvReader(), input, expected);
-    }
-
-    @Test
-    public void multi() throws CsvReaderException {
-        // These are columns of data. We are going to mix and match them.
- final String booleanInput = "false\ntrUe\nFaLsE\n"; - final String byteInput1 = "1\n2\n3\n"; - final String byteInput2 = "-1\n-2\n-3\n"; - final String shortInput = "300\n301\n302\n"; - final String intInput = "50000\n50001\n50002\n"; - final String longInput = "3000000000\n3000000001\n3000000002\n"; - final String doubleInput = "123.456\n234.567e25\n987.654e-20\n"; - final String dateTimeInput = "1966-03-01 12:34:56Z\n1977-02-08 03:04:05Z\n1989-11-11 11:11:11Z\n"; - final String charInput = "a\nb\nc\n"; - final String stringInput = "Deephaven\nStreaming\nJoins\n"; - - final String[] allInputs = { - booleanInput, byteInput1, byteInput2, shortInput, intInput, longInput, doubleInput, dateTimeInput, - charInput, stringInput - }; - final Class[] expectedTypes = { - boolean.class, byte.class, byte.class, short.class, int.class, long.class, double.class, - Instant.class, char.class, String.class - }; - final boolean[] entriesAreAllNullOrOneChar = { - false, true, false, false, false, false, false, false, true, false - }; - - for (int ii = 0; ii < allInputs.length; ++ii) { - for (int jj = 0; jj < allInputs.length; ++jj) { - final boolean oneCharIJ = entriesAreAllNullOrOneChar[ii] && entriesAreAllNullOrOneChar[jj]; - final Class inferredIJ = SimpleInferrer.infer(expectedTypes[ii], expectedTypes[jj], oneCharIJ); - for (int kk = 0; kk < allInputs.length; ++kk) { - final boolean oneCharIJK = oneCharIJ && entriesAreAllNullOrOneChar[kk]; - final Class expectedType = SimpleInferrer.infer(expectedTypes[kk], inferredIJ, oneCharIJK); - final String input = "Values\n" + allInputs[ii] + allInputs[jj] + allInputs[kk]; - final InputStream inputStream = toInputStream(input); - final CsvReader csvReader = defaultCsvReader().setParsers(Parsers.COMPLETE); - final ColumnSet columnSet = toColumnSet(parse(csvReader, inputStream)); - final Class actualType = columnSet.columns[0].reinterpretedType; - Assertions.assertThat(actualType) - .withFailMessage("Expected to infer type %s; actually inferred %s. Failing input: %s", - expectedType.getCanonicalName(), actualType.getCanonicalName(), input) - .isEqualTo(expectedType); - } - } - } - } - - private static class SimpleInferrer { - private static final int BOOLEAN = 1; - private static final int DATETIME = 2; - private static final int STRING = 3; - private static final int CHAR = 4; - private static final int BYTE = 5; - private static final int SHORT = 6; - private static final int INT = 7; - private static final int LONG = 8; - private static final int FLOAT = 9; - private static final int DOUBLE = 10; - - public static Class infer(final Class type1, final Class type2, final boolean allNullOrOneChar) { - // Same types yield that type. - if (type1 == type2) { - return type1; - } - - final int priority1 = getPriority(type1); - final int priority2 = getPriority(type2); - - final int highestPriority = Math.min(priority1, priority2); - final Class widestType = priority1 < priority2 ? type2 : type1; - - // (Boolean, DateTime, or String) and (something else) yields String. - if (highestPriority == BOOLEAN || highestPriority == DATETIME || highestPriority == STRING) { - return String.class; - } - - // Char paired with some numeric will yield char if the numerics are one digit wide; otherwise String - if (highestPriority == CHAR) { - return allNullOrOneChar ? char.class : String.class; - } - - // Numeric types yield the widest type. 
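//            A few concrete outcomes of this priority scheme (illustrative, derived from
//            the rules above rather than taken from the original source):
//
//            assert infer(short.class, double.class, false) == double.class; // widest numeric wins
//            assert infer(char.class, byte.class, true) == char.class;       // all-one-char digits stay char
//            assert infer(char.class, int.class, false) == String.class;     // wider digits force String
//            assert infer(boolean.class, long.class, false) == String.class; // boolean mixed with numeric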
- return widestType; - } - - private static int getPriority(Class type) { - if (type == boolean.class) - return BOOLEAN; - if (type == Instant.class) - return DATETIME; - if (type == char.class) - return CHAR; - if (type == String.class) - return STRING; - if (type == byte.class) - return BYTE; - if (type == short.class) - return SHORT; - if (type == int.class) - return INT; - if (type == long.class) - return LONG; - if (type == float.class) - return FLOAT; - if (type == double.class) - return DOUBLE; - throw new RuntimeException("Unexpected type " + type.getCanonicalName()); - } - } - - @Test - public void quotingSuccessfulEdgeCases() throws CsvReaderException { - final String input = "" + - "Values\n" + - "##\n" + // the empty string, which is configured below to give us NULL - "####\n" + // # - "######\n"; // ## - - final ColumnSet expected = ColumnSet.of( - Column.ofRefs("Values", - null, - "#", - "##")); - - invokeTest(defaultCsvReader().setquoteChar('#'), input, expected); - } - - @Test - public void quotingFailingEdgeCases() { - final String input = "" + - "Values\n" + - "###\n"; // invalid - - Assertions.assertThatThrownBy(() -> invokeTest(defaultCsvReader().setquoteChar('#'), input, ColumnSet.NONE)) - .hasRootCauseMessage("Cell did not have closing quote character"); - } - - @Test - public void quotingExcessMaterial() { - final String input = "" + - "Val1,Val2\n" + - "#hello#junk,there\n"; // invalid - - Assertions.assertThatThrownBy(() -> invokeTest(defaultCsvReader().setquoteChar('#'), input, ColumnSet.NONE)) - .hasRootCauseMessage("Logic error: final non-whitespace in field is not quoteChar"); - } - - @Test - public void stringWithNullLiteralSetAndValueNull() throws CsvReaderException { - // It should work when the null literal is set to something special, but the null String value is the null - // reference. 
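//        A minimal sketch, our own illustration rather than the reader's implementation,
//        of the null-literal rule the next test exercises: only a cell that exactly
//        matches the configured literal becomes the null reference; everything else is
//        kept verbatim.
//
//        static String interpretCell(String cell, String nullValueLiteral) {
//            return cell.equals(nullValueLiteral) ? null : cell;
//        }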
- final String input = "" + - "Values\n" + - "hello\n" + - "NULL\n"; - - final ColumnSet expected = ColumnSet.of( - Column.ofRefs("Values", "hello", null)); - - invokeTest(new CsvReader().setNullValueLiteral("NULL"), input, expected); - } - - @Test - public void stringsPound() throws CsvReaderException { - final String input = "" + - "Values\n" + - "#Hello, world#\n" + - "\n" + - "Goodbye.\n"; - - final ColumnSet expected = ColumnSet.of( - Column.ofRefs("Values", - "Hello, world", - null, - "Goodbye.")); - - invokeTest(defaultCsvReader().setquoteChar('#'), input, expected); - } - - - @Test - public void newlineDiversity() throws CsvReaderException { - final String input = "" + - "Values\r" + - "-2147483647\r\n" + - "\n" + - "2147483647\r\n"; - - final ColumnSet expected = ColumnSet.of( - Column.ofValues("Values", Integer.MIN_VALUE + 1, Sentinels.NULL_INT, Integer.MAX_VALUE)); - - invokeTest(defaultCsvReader(), input, expected); - } - - @Test - public void overrideHeaders() throws CsvReaderException { - final String input = "" + - "Foo,Bar,Baz\n" + - "1,2,3\n" + - "4,5,6\n"; - - final ColumnSet expected = ColumnSet.of( - Column.ofValues("A", 1, 4), - Column.ofValues("Qux", 2, 5), - Column.ofValues("C", 3, 6)); - - invokeTest(defaultCsvReader() - .setHeaders("A", "B", "C") - .setHeader(2, "Qux"), input, expected); - } - - - private static final String LANGUAGE_EXAMPLE_HEADERLESS_INPUT = "" + - "C,Dennis Ritchie,Compiled\n" + - "C++,Bjarne Stroustrup,Compiled\n" + - "Fortran,John Backus,Compiled\n" + - "Java,James Gosling,Both\n" + - "JavaScript,Brendan Eich,Interpreted\n" + - "MATLAB,Cleve Moler,Interpreted\n" + - "Pascal,Niklaus Wirth,Compiled\n" + - "Python,Guido van Rossum,Interpreted\n"; - - private static final String LANGUAGE_EXAMPLE_INPUT = "" + - "Language,Creator,Type\n" + - LANGUAGE_EXAMPLE_HEADERLESS_INPUT; - - private static final String LANGUAGE_EXAMPLE_TSV = "" + - "Language\tCreator\tType\n" + - "C\tDennis Ritchie\tCompiled\n" + - "C++\tBjarne Stroustrup\tCompiled\n" + - "Fortran\tJohn Backus\tCompiled\n" + - "Java\tJames Gosling\tBoth\n" + - "JavaScript\tBrendan Eich\tInterpreted\n" + - "MATLAB\tCleve Moler\tInterpreted\n" + - "Pascal\tNiklaus Wirth\tCompiled\n" + - "Python\tGuido van Rossum\tInterpreted\n"; - - @Test - public void languageExample() throws CsvReaderException { - invokeTest(defaultCsvReader(), LANGUAGE_EXAMPLE_INPUT, languageCreatorTypeTable()); - } - - @Test - public void languageExampleTsv() throws CsvReaderException { - invokeTest(defaultCsvReader().setFieldDelimiter('\t'), LANGUAGE_EXAMPLE_TSV, languageCreatorTypeTable()); - } - - @Test - public void languageExampleHeaderless() throws CsvReaderException { - invokeTest(defaultCsvReader().setHasHeaders(false), LANGUAGE_EXAMPLE_HEADERLESS_INPUT, - languageCreatorTypeTableHeaderless()); - } - - @Test - public void languageExampleHeaderlessExplicit() throws CsvReaderException { - final ColumnSet expected = languageCreatorTypeTable(); - invokeTest(defaultCsvReader() - .setHasHeaders(false) - .setHeaders(List.of("Language", "Creator", "Type")), - LANGUAGE_EXAMPLE_HEADERLESS_INPUT, expected); - } - - private static ColumnSet languageCreatorTypeTable() { - return populateLanguageExample("Language", "Creator", "Type"); - } - - private static ColumnSet languageCreatorTypeTableHeaderless() { - return populateLanguageExample("Column1", "Column2", "Column3"); - } - - private static ColumnSet populateLanguageExample(final String col1, final String col2, final String col3) { - return ColumnSet.of( - Column.ofRefs(col1, 
"C", "C++", "Fortran", "Java", - "JavaScript", "MATLAB", "Pascal", "Python"), - Column.ofRefs(col2, "Dennis Ritchie", "Bjarne Stroustrup", "John Backus", "James Gosling", - "Brendan Eich", "Cleve Moler", "Niklaus Wirth", "Guido van Rossum"), - Column.ofRefs(col3, "Compiled", "Compiled", "Compiled", "Both", - "Interpreted", "Interpreted", "Compiled", "Interpreted")); - } - - private static final String WHITESPACE_NO_QUOTES = "" + - "Sym,Type,Price,SecurityId\n" + - "GOOG, Dividend, 0.25, 200\n" + - "T, Dividend, 0.15, 300\n" + - " Z, Dividend, 0.18, 500\n"; - - @Test - public void whitespaceNoQuotes() throws CsvReaderException { - final ColumnSet expected = ColumnSet.of( - Column.ofRefs("Sym", "GOOG", "T", "Z"), - Column.ofRefs("Type", "Dividend", "Dividend", "Dividend"), - Column.ofValues("Price", 0.25, 0.15, 0.18), - Column.ofValues("SecurityId", 200, 300, 500)); - - invokeTest(defaultCsvReader(), WHITESPACE_NO_QUOTES, expected); - } - - @Test - public void whitespaceNoQuotesLiteral() throws CsvReaderException { - final ColumnSet expected = ColumnSet.of( - Column.ofRefs("Sym", "GOOG", "T", " Z"), - Column.ofRefs("Type", " Dividend", " Dividend", " Dividend"), - Column.ofValues("Price", 0.25, 0.15, 0.18), - Column.ofValues("SecurityId", 200, 300, 500)); - - invokeTest(defaultCsvReader().setIgnoreSurroundingSpaces(false), WHITESPACE_NO_QUOTES, expected); - } - - @Test - public void whitespaceOutside() throws CsvReaderException { - // Use vertical bars instead of quotation marks to make things more readable for the humans looking at this. - final String input = ("" + - "Sym,Type,Price,SecurityId\n" + - "|GOOG|, |Dividend|, |0.25|, |200|\n" + - "|T|, |Dividend|, |0.15|, |300|\n" + - " |Z|, |Dividend|, |0.18|, |500|\n"); - - final ColumnSet expected = ColumnSet.of( - Column.ofRefs("Sym", "GOOG", "T", "Z"), - Column.ofRefs("Type", "Dividend", "Dividend", "Dividend"), - Column.ofValues("Price", 0.25, 0.15, 0.18), - Column.ofValues("SecurityId", 200, 300, 500)); - - invokeTest(defaultCsvReader().setquoteChar('|'), input, expected); - } - - // Use vertical bars instead of quotation marks to make things more readable for the humans looking at this. 
- private static final String WHITESPACE_INSIDE = "" + - "Sym,Type,Price,SecurityId\n" + - "|GOOG|,| Dividend|,| 0.25|,| 200|\n" + - "|T|,|Dividend |,| 0.15|,| 300|\n" + - "| Z|,| Dividend |,| 0.18|,| 500|\n"; - - @Test - public void whitespaceInsideDefault() throws CsvReaderException { - final ColumnSet expected = ColumnSet.of( - Column.ofRefs("Sym", "GOOG", "T", " Z"), - Column.ofRefs("Type", " Dividend", "Dividend ", " Dividend "), - Column.ofValues("Price", 0.25, 0.15, 0.18), - Column.ofValues("SecurityId", 200, 300, 500)); - invokeTest(defaultCsvReader().setquoteChar('|'), WHITESPACE_INSIDE, expected); - } - - @Test - public void whitespaceInsideTrim() throws CsvReaderException { - final ColumnSet expected = ColumnSet.of( - Column.ofRefs("Sym", "GOOG", "T", "Z"), - Column.ofRefs("Type", "Dividend", "Dividend", "Dividend"), - Column.ofValues("Price", 0.25, 0.15, 0.18), - Column.ofValues("SecurityId", 200, 300, 500)); - - invokeTest(defaultCsvReader().setquoteChar('|').setTrim(true), WHITESPACE_INSIDE, expected); - } - - private static final String WHITESPACE_INSIDE_AND_OUTSIDE = "" + - "Sym,Type,Price,SecurityId\n" + - "|GOOG|, | Dividend|, | 0.25|, | 200|\n" + - "|T|, | Dividend|, | 0.15|, | 300|\n" + - "| Z|, | Dividend|, | 0.18|, | 500|\n"; - - @Test - public void whitespaceInsideAndOutsideDefault() throws CsvReaderException { - final ColumnSet expected = ColumnSet.of( - Column.ofRefs("Sym", "GOOG", "T", " Z"), - Column.ofRefs("Type", " Dividend", " Dividend", " Dividend"), - Column.ofValues("Price", 0.25, 0.15, 0.18), - Column.ofValues("SecurityId", 200, 300, 500)); - - invokeTest(defaultCsvReader().setquoteChar('|'), WHITESPACE_INSIDE_AND_OUTSIDE, expected); - } - - @Test - public void whitespaceInsideAndOutsideTrim() throws CsvReaderException { - final ColumnSet expected = ColumnSet.of( - Column.ofRefs("Sym", "GOOG", "T", "Z"), - Column.ofRefs("Type", "Dividend", "Dividend", "Dividend"), - Column.ofValues("Price", 0.25, 0.15, 0.18), - Column.ofValues("SecurityId", 200, 300, 500)); - - invokeTest(defaultCsvReader().setquoteChar('|').setTrim(true), WHITESPACE_INSIDE_AND_OUTSIDE, expected); - } - - @Test - public void noTrailingNewlineHeaderOnly() throws CsvReaderException { - // Sometimes there is no trailing newline. That's OK. - final String input = "" + - "Values1,Values2"; - - final ColumnSet expected = ColumnSet.of( - Column.ofArray("Values1", new short[0]), - Column.ofArray("Values2", new short[0])); - - invokeTest(defaultCsvReader().setNullParser(Parsers.SHORT), input, expected); - } - - @Test - public void noTrailingNewline() throws CsvReaderException { - // Sometimes there is no trailing newline. That's OK. - final String input = "" + - "SomeInts,SomeStrings\n" + - "-3,foo\n" + - "4,bar\n" + - "-5,baz"; - - final ColumnSet expected = ColumnSet.of( - Column.ofValues("SomeInts", -3, 4, -5), - Column.ofRefs("SomeStrings", "foo", "bar", "baz")); - - invokeTest(defaultCsvReader(), input, expected); - } - - @Test - public void tooFewColumnsWithFinalNewline() throws CsvReaderException { - // If there are too few columns, we just pad with nulls. 
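//        A minimal sketch, our own illustration rather than the reader's implementation,
//        of the padding rule the next two tests exercise: a short row is logically
//        extended with null cells out to the full column count.
//
//        static String[] padRow(String[] row, int numCols) {
//            return java.util.Arrays.copyOf(row, numCols); // copyOf null-fills the new tail cells
//        }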
- final String input = "" + - "A,B,C,D\n" + - "-3,foo,1.2,false\n" + - "4,bar,3.4,true\n" + - "-5\n"; - - final ColumnSet expected = ColumnSet.of( - Column.ofValues("A", -3, 4, -5), - Column.ofRefs("B", "foo", "bar", null), - Column.ofValues("C", 1.2, 3.4, Sentinels.NULL_DOUBLE), - Column.ofValues("D", (byte) 0, (byte) 1, Sentinels.NULL_BOOLEAN_AS_BYTE).reinterpret(boolean.class)); - - invokeTest(defaultCsvReader(), input, expected); - } - - @Test - public void tooFewColumnsWithoutFinalNewline() throws CsvReaderException { - // If there are too few columns, we just pad with nulls. - final String input = "" + - "A,B,C,D\n" + - "-3,foo,1.2,false\n" + - "4,bar,3.4,true\n" + - "-5"; - - final ColumnSet expected = ColumnSet.of( - Column.ofValues("A", -3, 4, -5), - Column.ofRefs("B", "foo", "bar", null), - Column.ofValues("C", 1.2, 3.4, Sentinels.NULL_DOUBLE), - Column.ofValues("D", (byte) 0, (byte) 1, Sentinels.NULL_BOOLEAN_AS_BYTE).reinterpret(boolean.class)); - - invokeTest(defaultCsvReader(), input, expected); - } - - @Test - public void tooManyColumns() { - // Too many columns is an error. - final String input = "" + - "SomeInts,SomeStrings\n" + - "-3,foo\n" + - "4,bar,quz\n" + - "-5,baz\n"; - - Assertions.assertThatThrownBy(() -> invokeTest(defaultCsvReader(), input, ColumnSet.NONE)) - .hasRootCauseMessage("Row 3 has too many columns (expected 2)"); - } - - @Test - public void duplicateColumnName() { - final String input = "" + - "abc,xyz,abc\n" + - "Hello,there,Deephaven\n"; - Assertions.assertThatThrownBy(() -> invokeTest(defaultCsvReader(), input, ColumnSet.NONE)) - .hasMessageContaining("Repeated headers: abc"); - } - - @Test - public void trailingNullColumnElided() throws CsvReaderException { - // A completely-empty rightmost column (corresponding to a text file with trailing field delimiters on every - // line) will just be dropped. - final String input = "" + - "abc,def,ghi,\n" + - "Hello,there,Deephaven,\n" + - "foo,bar,baz,\n"; - - final ColumnSet expected = ColumnSet.of( - Column.ofRefs("abc", "Hello", "foo"), - Column.ofRefs("def", "there", "bar"), - Column.ofRefs("ghi", "Deephaven", "baz")); - - invokeTest(defaultCsvReader(), input, expected); - } - - @Test - public void trailingNullColumnMustBeEmpty() { - // A completely-empty rightmost column (corresponding to a text file with trailing field delimiters on every - // line) will just be dropped. 
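//        A minimal sketch, our own illustration, of the elision rule: the trailing column
//        may be dropped only when every row's final cell is empty; any data there is an
//        error, as the body of this test shows.
//
//        static boolean canElideTrailingColumn(List<String[]> rows, int numCols) {
//            return rows.stream().allMatch(row -> row[numCols - 1].isEmpty());
//        }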
- final String input = "" + - "abc,def,ghi,\n" + - "Hello,there,Deephaven,\n" + - "foo,bar,baz,nonempty\n"; - Assertions.assertThatThrownBy(() -> invokeTest(defaultCsvReader(), input, ColumnSet.NONE)) - .hasRootCauseMessage("Column assumed empty but contains data"); - } - - @Test - public void dateTimes() throws CsvReaderException { - final String input = "" + - "Values\n" + - "2021-09-27T19:00:00Z\n" + - "\n" + - "2021-09-27T20:00:00Z\n"; - - final ColumnSet expected = ColumnSet.of( - Column.ofValues("Values", 1632769200000000000L, Sentinels.NULL_LONG, 1632772800000000000L) - .reinterpret(Instant.class)); - - invokeTest(defaultCsvReader(), input, expected); - } - - @Test - public void dateTimeFormats() throws CsvReaderException { - final String input = "" + - "Values\n" + - "20210927T19Z\n" + - "20210927 19Z\n" + - "20210927T1934Z\n" + - "20210927T193458Z\n" + - "20210927T193458.123Z\n" + - "20210927T193458.123456Z\n" + - "20210927T193458.123456789Z\n" + - "20210927T193458.123456789+0200\n" + - "20210927T193458.123456789-0330\n" + - - "2021-09-27T19Z\n" + - "2021-09-27 19Z\n" + - "2021-09-27T19:34Z\n" + - "2021-09-27T19:34:58Z\n" + - "2021-09-27T19:34:58.123Z\n" + - "2021-09-27T19:34:58.123456Z\n" + - "2021-09-27T19:34:58.123456789Z\n" + - "2021-09-27T19:34:58.123456789+0200\n" + - "2021-09-27T19:34:58.123456789-0330\n"; - - final ColumnSet expected = ColumnSet.of( - Column.ofValues("Values", - 1632769200000000000L, - 1632769200000000000L, - 1632771240000000000L, - 1632771298000000000L, - 1632771298123000000L, - 1632771298123456000L, - 1632771298123456789L, - 1632764098123456789L, - 1632783898123456789L, - - 1632769200000000000L, - 1632769200000000000L, - 1632771240000000000L, - 1632771298000000000L, - 1632771298123000000L, - 1632771298123456000L, - 1632771298123456789L, - 1632764098123456789L, - 1632783898123456789L) - .reinterpret(Instant.class)); - - invokeTest(defaultCsvReader(), input, expected); - } - - - @Test - public void timestampSeconds() throws CsvReaderException { - final String input = "" + - "Values\n" + - "1632769200\n" + - "\n" + - "1632772800\n"; - - final ColumnSet expected = ColumnSet.of( - Column.ofValues("Values", 1632769200000000000L, Sentinels.NULL_TIMESTAMP_AS_LONG, - 1632772800000000000L).reinterpret(Instant.class)); - - invokeTest(defaultCsvReader().setParsers(List.of(Parsers.TIMESTAMP_SECONDS)), input, expected); - } - - @Test - public void timestampMillis() throws CsvReaderException { - final String input = "" + - "Values\n" + - "1632769200000\n" + - "\n" + - "1632772800000\n"; - - final ColumnSet expected = ColumnSet.of( - Column.ofValues("Values", 1632769200000000000L, Sentinels.NULL_TIMESTAMP_AS_LONG, - 1632772800000000000L).reinterpret(Instant.class)); - - invokeTest(defaultCsvReader().setParsers(List.of(Parsers.TIMESTAMP_MILLIS)), input, expected); - } - - @Test - public void timestampMicros() throws CsvReaderException { - final String input = "" + - "Values\n" + - "1632769200000000\n" + - "\n" + - "1632772800000000\n"; - - final ColumnSet expected = ColumnSet.of( - Column.ofValues("Values", 1632769200000000000L, Sentinels.NULL_TIMESTAMP_AS_LONG, - 1632772800000000000L).reinterpret(Instant.class)); - - invokeTest(defaultCsvReader().setParsers(List.of(Parsers.TIMESTAMP_MICROS)), input, expected); - } - - @Test - public void timestampNanos() throws CsvReaderException { - final String input = "" + - "Values\n" + - "1632769200000000000\n" + - "\n" + - "1632772800000000000\n"; - - final ColumnSet expected = ColumnSet.of( - Column.ofValues("Values", 
1632769200000000000L, Sentinels.NULL_TIMESTAMP_AS_LONG, - 1632772800000000000L).reinterpret(Instant.class)); - - invokeTest(defaultCsvReader().setParsers(List.of(Parsers.TIMESTAMP_NANOS)), input, expected); - } - - @Test - public void dateTimeCustomizedTimezone() throws CsvReaderException { - final String input = "" + - "Values\n" + - "2021-09-27T19:00:00 UTC\n" + - "\n" + - "2021-09-27T20:00:00 UTC\n"; - - final ColumnSet expected = ColumnSet.of( - Column.ofValues("Values", 1632769200000000000L, Sentinels.NULL_LONG, 1632772800000000000L) - .reinterpret(Instant.class).reinterpret(Instant.class)); - - // Simple custom time zone parser that only understands " UTC" - Tokenizer.CustomTimeZoneParser myTimeZoneParser = (bs, tzo, off) -> { - if (bs.size() < 4) { - return false; - } - final byte[] d = bs.data(); - final int o = bs.begin(); - if (d[o] == ' ' && d[o + 1] == 'U' && d[o + 2] == 'T' && d[o + 3] == 'C') { - tzo.setValue(ZoneOffset.UTC); - off.setValue(0); - bs.setBegin(bs.begin() + 4); - return true; - } - return false; - }; - - invokeTest(defaultCsvReader() - .setCustomTimeZoneParser(myTimeZoneParser), - input, expected); - } - - private static final String ALL_NULLS = "" + - "Values\n" + - "\n" + - "\n" + - "\n" + - "\n" + - "\n"; - - @Test - public void unparseable() { - final String input = "" + - "Values\n" + - "hello\n" + - "there\n"; - - Assertions.assertThatThrownBy(() -> invokeTest(defaultCsvReader() - .setParsers(List.of(Parsers.INT, Parsers.LONG, Parsers.DATETIME)), input, ColumnSet.NONE)); - } - - @Test - public void noParsers() { - final String input = "" + - "Values\n" + - "hello\n" + - "there\n"; - - Assertions.assertThatThrownBy(() -> invokeTest(defaultCsvReader().setParsers(List.of()), input, ColumnSet.NONE)) - .hasRootCauseMessage("No available parsers."); - } - - @Test - public void allNullsWithSpecifiedParser() throws CsvReaderException { - final long nv = Sentinels.NULL_LONG; - final ColumnSet expected = ColumnSet.of( - Column.ofValues("Values", nv, nv, nv, nv, nv)); - - invokeTest(defaultCsvReader().setParserFor("Values", Parsers.LONG), ALL_NULLS, expected); - } - - @Test - public void allNullsWithNullParser() throws CsvReaderException { - final long nv = Sentinels.NULL_LONG; - final ColumnSet expected = ColumnSet.of( - Column.ofValues("Values", nv, nv, nv, nv, nv)); - - invokeTest(defaultCsvReader().setNullParser(Parsers.LONG), ALL_NULLS, expected); - } - - @Test - public void allNullsButNoParser() { - Assertions.assertThatThrownBy(() -> invokeTest(defaultCsvReader(), ALL_NULLS, ColumnSet.NONE)) - .hasRootCauseMessage( - "Column contains all null cells: can't infer type of column, and nullParser is not set."); - } - - @Test - public void emptyTableWithSpecifiedParser() throws CsvReaderException { - final String input = "Values\n"; - final ColumnSet expected = ColumnSet.of( - Column.ofArray("Values", new long[0])); - - invokeTest(defaultCsvReader().setParserFor("Values", Parsers.LONG), input, expected); - } - - @Test - public void unicode() throws CsvReaderException { - final String input = "" + - "Emojis\n" + - "Hello 💖\n" + - "Regular ASCII\n" + - "✨ Deephaven ✨\n" + - "🎆🎆🎆🎆🎆\n"; - - final ColumnSet expected = ColumnSet.of( - Column.ofRefs("Emojis", "Hello 💖", "Regular ASCII", "✨ Deephaven ✨", "🎆🎆🎆🎆🎆")); - - invokeTest(defaultCsvReader(), input, expected); - } - - /** - * Test that input will be parsed as a char so long as it is in the BMP. The input is "tricky" because it starts out - * looking like integrals. 
- */ - @Test - public void unicodeChars() throws CsvReaderException { - // So long as a character is in the BMP (i.e. <= U+FFFF), it will be parsed as a char column. - final String input = "" + - "BMPChar\n" + - "1\n" + - "2\n" + - "3\n" + - "X\n" + - "✈\n" + - "❎\n" + - "➉\n" + - "✈\n" + - "✨\n"; - - final ColumnSet expected = ColumnSet.of( - Column.ofValues("BMPChar", '1', '2', '3', 'X', '✈', '❎', '➉', '✈', '✨')); - - invokeTest(defaultCsvReader(), input, expected); - } - - /** - * Large cells (10K characters or so), some with fancy Unicode, quotes, and escaped quotes. - */ - @Test - public void largeCells() throws CsvReaderException { - final StringBuilder sbBytes = new StringBuilder(); - final StringBuilder sbChars = new StringBuilder(); - final StringBuilder sbQuotesEscaped = new StringBuilder(); - final StringBuilder sbQuotesLiteral = new StringBuilder(); - for (int ii = 0; ii < 1000; ++ii) { - sbBytes.append("Deephaven!"); - sbChars.append("🍣Deep🍔haven!🍕"); - sbQuotesEscaped.append("Deep\"\"haven!"); - sbQuotesLiteral.append("Deep\"haven!"); - } - final String largeCellBytes = sbBytes.toString(); - final String largeCellChars = sbChars.toString(); - final String largeCellEscaped = '"' + sbQuotesEscaped.toString() + '"'; - final String largeCellLiteral = sbQuotesLiteral.toString(); - - final String input = "" + - "LargeEmojis\n" + - largeCellBytes + "\n" + - largeCellChars + "\n" + - largeCellEscaped + "\n" + - largeCellBytes + "\n" + - largeCellChars + "\n" + - largeCellEscaped + "\n"; - - System.out.println(input); - final ColumnSet expected = ColumnSet.of( - Column.ofRefs("LargeEmojis", largeCellBytes, largeCellChars, largeCellLiteral, - largeCellBytes, largeCellChars, largeCellLiteral)); - - invokeTest(defaultCsvReader(), input, expected); - } - - /** - * Test the global null literal value. - */ - @Test - public void customGlobalNullValue() throws CsvReaderException { - final String input = "" + - "SomeBytes,SomeShorts,SomeInts,SomeLongs\n" + - "1,2,3,4\n" + - "NULL,NULL,NULL,NULL\n" + - "100,32000,2000000000,4000000000\n"; - - final ColumnSet expected = ColumnSet.of( - Column.ofValues("SomeBytes", (byte) 1, Sentinels.NULL_BYTE, (byte) 100), - Column.ofValues("SomeShorts", (short) 2, Sentinels.NULL_SHORT, (short) 32000), - Column.ofValues("SomeInts", 3, Sentinels.NULL_INT, 2000000000), - Column.ofValues("SomeLongs", 4L, Sentinels.NULL_LONG, 4000000000L)); - - invokeTest(defaultCsvReader().setParsers(Parsers.COMPLETE).setNullValueLiteral("NULL"), input, expected); - } - - /** - * Test column-specific null literals values which may be specified by column name or index, and also show that - * Unicode characters work as the null literal. 
- */ - @Test - public void customColumnSpecificNullValue() throws CsvReaderException { - final String input = "" + - "SomeBytes,SomeShorts,SomeInts,SomeLongs\n" + - "1,2,3,4\n" + - "❌,🔥,⋰⋱,𝓓𝓮𝓮𝓹𝓱𝓪𝓿𝓮𝓷\n" + - "100,32000,2000000000,4000000000\n"; - - final ColumnSet expected = ColumnSet.of( - Column.ofValues("SomeBytes", (byte) 1, Sentinels.NULL_BYTE, (byte) 100), - Column.ofValues("SomeShorts", (short) 2, Sentinels.NULL_SHORT, (short) 32000), - Column.ofValues("SomeInts", 3, Sentinels.NULL_INT, 2000000000), - Column.ofValues("SomeLongs", 4L, Sentinels.NULL_LONG, 4000000000L)); - - invokeTest(defaultCsvReader() - .setParsers(Parsers.COMPLETE) - .setNullValueLiteralFor(1, "❌") - .setNullValueLiteralFor(2, "🔥") - .setNullValueLiteralFor("SomeInts", "⋰⋱") - .setNullValueLiteralFor("SomeLongs", "𝓓𝓮𝓮𝓹𝓱𝓪𝓿𝓮𝓷"), - input, expected); - } - - /** - * Provide a number of rows larger than ParserBase.DEST_BLOCK_SIZE. - */ - @Test - public void manyRows() throws CsvReaderException { - final StringBuilder sb = new StringBuilder(); - sb.append( - "SomeBooleans,SomeBytes,SomeShorts,SomeInts,SomeLongs,SomeDoubles,SomeStrings,SomeChars,SomeDateTimes,SomeTimestamps\n"); - final TByteArrayList booleans = new TByteArrayList(); - final TByteArrayList bytes = new TByteArrayList(); - final TShortArrayList shorts = new TShortArrayList(); - final TIntArrayList ints = new TIntArrayList(); - final TLongArrayList longs = new TLongArrayList(); - final TDoubleArrayList doubles = new TDoubleArrayList(); - final ArrayList strings = new ArrayList<>(); - final TCharArrayList chars = new TCharArrayList(); - final TLongArrayList dateTimesAsLongs = new TLongArrayList(); - final TLongArrayList timestampsAsLongs = new TLongArrayList(); - final String qq = "qq"; - final long dtl = 799402088000000000L; // 1995-05-02 08:08:08Z - final long tsl = 3456789012L; - // Make sure we have a few more rows than Parser.DEST_BLOCK_SIZE - for (int ii = 0; ii < Parser.CHUNK_SIZE + 3; ++ii) { - sb.append("true,5,6,7,8,1.1,qq,r,1995-05-02 08:08:08Z,3456789012\n"); - booleans.add((byte) 1); - bytes.add((byte) 5); - shorts.add((short) 6); - ints.add(7); - longs.add(8); - doubles.add(1.1); - strings.add(qq); - chars.add('r'); - dateTimesAsLongs.add(dtl); - timestampsAsLongs.add(tsl); - } - // Add a row like this somewhere (let's put it at the end to make things challenging) so inference picks the - // right types. 
- sb.append("false,100,32000,2000000000,4000000000,6.6e50,yy,z,2020-03-05 12:34:56Z,123456789\n"); - booleans.add((byte) 0); - bytes.add((byte) 100); - shorts.add((short) 32000); - ints.add(2000000000); - longs.add(4000000000L); - doubles.add(6.6e50); - strings.add("yy"); - chars.add('z'); - dateTimesAsLongs.add(1583411696000000000L); // 2020-03-05 12:34:56Z - timestampsAsLongs.add(123456789); - - final String input = sb.toString(); - final ColumnSet expected = ColumnSet.of( - Column.ofArray("SomeBooleans", booleans.toArray()).reinterpret(boolean.class), - Column.ofArray("SomeBytes", bytes.toArray()), - Column.ofArray("SomeShorts", shorts.toArray()), - Column.ofArray("SomeInts", ints.toArray()), - Column.ofArray("SomeLongs", longs.toArray()), - Column.ofArray("SomeDoubles", doubles.toArray()), - Column.ofArray("SomeStrings", strings.toArray(new String[0])), - Column.ofArray("SomeChars", chars.toArray()), - Column.ofArray("SomeDateTimes", dateTimesAsLongs.toArray()).reinterpret(Instant.class), - Column.ofArray("SomeTimestamps", timestampsAsLongs.toArray()).reinterpret(Instant.class)); - invokeTest(defaultCsvReader() - .setParsers(Parsers.COMPLETE) - .setParserFor("SomeTimestamps", Parsers.TIMESTAMP_NANOS), - input, expected); - } - - @Test - public void customParser() throws CsvReaderException { - final String bd1 = - "81290897538197389132106321892137218932178913227138932178912312132.21879213278912387692138723198"; - final String bd2 = - "-9210381027382193791312718239712389127812931236183167913268912683921681293621891236821.12986178632478123678312762318"; - - final String input = "" + - "Index,BigValues\n" + - "0," + bd1 + "\n" + - "1,\n" + - "2," + bd2 + "\n"; - - final ColumnSet expected = ColumnSet.of( - Column.ofValues("Index", 0, 1, 2), - Column.ofRefs("BigValues", new BigDecimal(bd1), null, new BigDecimal(bd2))); - - invokeTest(defaultCsvReader() - .setParserFor(2, new MyBigDecimalParser()), - input, expected); - } - - private static class MyBigDecimalParser implements Parser { - @NotNull - @Override - public ParserContext makeParserContext(GlobalContext gctx, int chunkSize) { - final MyBigDecimalSink sink = new MyBigDecimalSink(); - return new ParserContext<>(sink, null, new BigDecimal[chunkSize]); - } - - @Override - public long tryParse(GlobalContext gctx, ParserContext pctx, IteratorHolder ih, long begin, - long end, boolean appending) throws CsvReaderException { - final boolean[] nulls = gctx.nullChunk(); - - final Sink sink = pctx.sink(); - final BigDecimal[] values = pctx.valueChunk(); - - // Reusable buffer - char[] charData = new char[0]; - - long current = begin; - int chunkIndex = 0; - do { - if (chunkIndex == values.length) { - sink.write(values, nulls, current, current + chunkIndex, appending); - current += chunkIndex; - chunkIndex = 0; - } - if (current + chunkIndex == end) { - break; - } - if (gctx.isNullCell(ih)) { - nulls[chunkIndex++] = true; - continue; - } - final ByteSlice bs = ih.bs(); - if (!RangeTests.isAscii(bs.data(), bs.begin(), bs.end())) { - break; - } - - // Convert bytes to chars. Annoying. 
- if (charData.length < bs.size()) { - charData = new char[bs.size()]; - } - int destIndex = 0; - for (int cur = bs.begin(); cur != bs.end(); ++cur) { - charData[destIndex++] = (char) bs.data()[cur]; - } - - try { - values[chunkIndex] = new BigDecimal(charData, 0, destIndex); - } catch (NumberFormatException ne) { - break; - } - nulls[chunkIndex] = false; - ++chunkIndex; - } while (ih.tryMoveNext()); - sink.write(values, nulls, current, current + chunkIndex, appending); - return current + chunkIndex; - } - } - - private static class MyBigDecimalSink implements Sink, ColumnProvider { - private final List dest = new ArrayList<>(); - - @Override - public void write(BigDecimal[] src, boolean[] isNull, long destBegin, long destEnd, boolean appending) { - if (destBegin == destEnd) { - return; - } - - final int size = Math.toIntExact(destEnd - destBegin); - if (appending) { - // If the new area starts beyond the end of the destination, pad the destination. - while (dest.size() < destBegin) { - dest.add(null); - } - for (int ii = 0; ii < size; ++ii) { - dest.add(isNull[ii] ? null : src[ii]); - } - return; - } - - final int destBeginAsInt = Math.toIntExact(destBegin); - for (int ii = 0; ii < size; ++ii) { - dest.set(destBeginAsInt + ii, isNull[ii] ? null : src[ii]); - } - } - - @Override - public Column toColumn(final String columnName) { - return Column.ofArray(columnName, dest.toArray(new BigDecimal[0])); - } - } - - private static final class ColumnSet { - public static final ColumnSet NONE = new ColumnSet(new Column[0], 0); - - private final Column[] columns; - private final int columnSize; - - public static ColumnSet of(Column... columns) { - if (columns.length == 0) { - throw new RuntimeException("Empty column set is not permitted"); - } - final int c0Size = columns[0].size(); - for (int ii = 1; ii < columns.length; ++ii) { // Deliberately starting at 1. - final int ciiSize = columns[ii].size(); - if (ciiSize != c0Size) { - throw new RuntimeException( - String.format("Column %d (size %d) has a different size than column 0 (size %d)", - ii, ciiSize, c0Size)); - } - } - return new ColumnSet(columns, c0Size); - } - - private ColumnSet(Column[] columns, int columnSize) { - this.columns = columns; - this.columnSize = columnSize; - } - - @Override - public String toString() { - final StringBuilder sb = new StringBuilder(); - final List> colList = List.of(columns); - - final BiFunction, Class, String> renderType = (etype, rtype) -> { - if (etype == rtype) { - return etype.getCanonicalName(); - } - return etype.getCanonicalName() + "->" + rtype.getCanonicalName(); - }; - - Renderer.renderList(sb, colList, ",", - col -> String.format("%s(%s)", col.name(), - renderType.apply(col.elementType(), col.reinterpretedType))); - for (int jj = 0; jj < columnSize; ++jj) { - final int jjFinal = jj; - sb.append('\n'); - Renderer.renderList(sb, colList, ",", col -> safeToString(col.getItem(jjFinal))); - } - return sb.toString(); - } - - private static String safeToString(Object o) { - return o == null ? "(null)" : o.toString(); - } - } - - private static final class Column { - private final String name; - private final TARRAY values; - private final int size; - private final Class reinterpretedType; - - public static Column ofValues(final String name, final byte... values) { - return ofArray(name, values); - } - - public static Column ofValues(final String name, final short... values) { - return ofArray(name, values); - } - - public static Column ofValues(final String name, final int... 
values) { - return ofArray(name, values); - } - - public static Column ofValues(final String name, final long... values) { - return ofArray(name, values); - } - - public static Column ofValues(final String name, final float... values) { - return ofArray(name, values); - } - - public static Column ofValues(final String name, final double... values) { - return ofArray(name, values); - } - - public static Column ofValues(final String name, final char... values) { - return ofArray(name, values); - } - - public static Column ofRefs(final String name, final T... values) { - return ofArray(name, values); - } - - public static Column ofArray(final String name, final TARRAY values) { - return new Column<>(name, values); - } - - private Column(final String name, final TARRAY values) { - this(name, values, Array.getLength(values), values.getClass().getComponentType()); - } - - private Column(final String name, final TARRAY values, int size, Class reinterpretedType) { - this.name = name; - this.values = values; - this.size = size; - this.reinterpretedType = reinterpretedType; - } - - public Column reinterpret(Class reinterpretedType) { - return new Column<>(name, values, size, reinterpretedType); - } - - public int size() { - return size; - } - - public String name() { - return name; - } - - public Class elementType() { - return values.getClass().getComponentType(); - } - - public Class reinterpretedType() { - return reinterpretedType; - } - - public Object getItem(int index) { - return Array.get(values, index); - } - } - - private static CsvReader defaultCsvReader() { - return new CsvReader().setIgnoreSurroundingSpaces(true); - } - - private static void invokeTest(CsvReader csvReader, String input, ColumnSet expected) throws CsvReaderException { - final InputStream inputStream = toInputStream(input); - final CsvReader.Result result = parse(csvReader, inputStream); - final ColumnSet actual = toColumnSet(result); - final String expectedToString = expected.toString(); - final String actualToString = actual.toString(); - Assertions.assertThat(actualToString).isEqualTo(expectedToString); - } - - /** - * Parses {@code inputStream} according to the specifications of {@code csvReader}. - * - * @param inputStream the input stream. - * @return The parsed data - * @throws CsvReaderException If any sort of failure occurs. - */ - private static CsvReader.Result parse(CsvReader csvReader, InputStream inputStream) throws CsvReaderException { - return csvReader.read(inputStream, makeMySinkFactory()); - } - - /** - * Convert String to InputStream - */ - private static InputStream toInputStream(final String input) { - final StringReader reader = new StringReader(input); - return new ReaderInputStream(reader, StandardCharsets.UTF_8); - } - - /*** - * Converts the {@link CsvReader.Result} to a {@link ColumnSet}. 
- */ - private static ColumnSet toColumnSet(final CsvReader.Result result) { - final int numCols = result.numCols(); - - final String[] columnNames = result.columnNames(); - final Sink[] sinks = result.columns(); - final Column[] columns = new Column[numCols]; - for (int ii = 0; ii < numCols; ++ii) { - final String columnName = columnNames[ii]; - final ColumnProvider sink = (ColumnProvider) sinks[ii]; - columns[ii] = sink.toColumn(columnName); - } - return ColumnSet.of(columns); - } - - public interface ColumnProvider { - Column toColumn(final String columnName); - } - - private static abstract class MySinkBase implements Sink, ColumnProvider { - protected final TCOLLECTION collection; - protected int collectionSize; - protected final FillOperation fillOperation; - protected final SetOperation setOperation; - protected final AddOperation addOperation; - protected final BiFunction> toColumnOperation; - - protected MySinkBase(TCOLLECTION collection, FillOperation fillOperation, - SetOperation setOperation, AddOperation addOperation, - BiFunction> toColumnOperation) { - this.collection = collection; - this.fillOperation = fillOperation; - this.setOperation = setOperation; - this.addOperation = addOperation; - this.toColumnOperation = toColumnOperation; - } - - @Override - public final void write(final TARRAY src, final boolean[] isNull, final long destBegin, - final long destEnd, boolean appending) { - if (destBegin == destEnd) { - return; - } - final int size = Math.toIntExact(destEnd - destBegin); - final int destBeginAsInt = Math.toIntExact(destBegin); - final int destEndAsInt = Math.toIntExact(destEnd); - nullFlagsToValues(isNull, src, size); - - if (!appending) { - // Replacing. - setOperation.apply(collection, destBeginAsInt, src, 0, size); - return; - } - - // Appending. First, if the new area starts beyond the end of the destination, pad the destination. - if (collectionSize < destBegin) { - fillOperation.apply(collection, collectionSize, destBeginAsInt); - collectionSize = destBeginAsInt; - } - // Then do the append. - addOperation.apply(collection, src, 0, size); - collectionSize = destEndAsInt; - } - - protected abstract void nullFlagsToValues(final boolean[] isNull, final TARRAY values, final int size); - - public final Column toColumn(final String columnName) { - return toColumnOperation.apply(collection, columnName); - } - - /** - * Meant to be paired with e.g. TDoubleArrayList.fill(int fromIndex, int toIndex, 0.0) - */ - protected interface FillOperation { - void apply(TCOLLECTION coll, int fromIndex, int toIndex); - } - - /** - * Meant to be paired with e.g. TDoubleArrayList.set(int offset, double[] values, int valOffset, int length) - */ - protected interface SetOperation { - void apply(TCOLLECTION coll, int offset, TARRAY values, int vallOffset, int length); - } - - /** - * Meant to be paired with e.g. 
TDoubleArrayList.add(double[] values, int offset, int length) - */ - protected interface AddOperation { - void apply(TCOLLECTION coll, TARRAY values, int offset, int length); - } - } - - private static abstract class MySourceAndSinkBase extends MySinkBase - implements io.deephaven.csv.sinks.Source, Sink { - private final ToArrayOperation toArrayOperation; - - protected MySourceAndSinkBase(TCOLLECTION collection, FillOperation fillOperation, - SetOperation setOperation, AddOperation addOperation, - BiFunction> toColumnOperation, - ToArrayOperation toArrayOperation) { - super(collection, fillOperation, setOperation, addOperation, toColumnOperation); - this.toArrayOperation = toArrayOperation; - } - - @Override - public void read(TARRAY dest, boolean[] isNull, long srcBegin, long srcEnd) { - if (srcBegin == srcEnd) { - return; - } - final int size = Math.toIntExact(srcEnd - srcBegin); - toArrayOperation.apply(collection, dest, Math.toIntExact(srcBegin), 0, size); - nullValuesToFlags(dest, isNull, size); - } - - protected abstract void nullValuesToFlags(final TARRAY values, final boolean[] isNull, final int size); - - /** - * Meant to be paired with e.g. TDoubleArrayList.add(double[] dest, int source_pos, int dest_pos, int length) - */ - private interface ToArrayOperation { - void apply(TCOLLECTION coll, TARRAY dest, int source_pos_, int dest_pos, int length); - } - } - - private static class MyByteSinkBase extends MySourceAndSinkBase { - protected final byte nullSentinel; - - public MyByteSinkBase(final byte nullSentinel, final Class reinterpretedType) { - super(new TByteArrayList(), - (dest, from, to) -> dest.fill(from, to, (byte) 0), - TByteArrayList::set, - TByteArrayList::add, - (dest, name) -> Column.ofArray(name, dest.toArray()).reinterpret(reinterpretedType), - TByteArrayList::toArray); - this.nullSentinel = nullSentinel; - } - - @Override - protected final void nullFlagsToValues(boolean[] isNull, byte[] values, int size) { - for (int ii = 0; ii < size; ++ii) { - if (isNull[ii]) { - values[ii] = nullSentinel; - } - } - } - - @Override - protected final void nullValuesToFlags(byte[] values, boolean[] isNull, int size) { - for (int ii = 0; ii < size; ++ii) { - isNull[ii] = values[ii] == nullSentinel; - } - } - } - - private static final class MyByteSink extends MyByteSinkBase { - public MyByteSink() { - super(Sentinels.NULL_BYTE, byte.class); - } - } - - private static final class MyShortSink extends MySourceAndSinkBase { - public MyShortSink() { - super(new TShortArrayList(), - (dest, from, to) -> dest.fill(from, to, (short) 0), - TShortArrayList::set, - TShortArrayList::add, - (dest, name) -> Column.ofArray(name, dest.toArray()), - TShortArrayList::toArray); - } - - @Override - protected void nullFlagsToValues(boolean[] isNull, short[] values, int size) { - for (int ii = 0; ii < size; ++ii) { - if (isNull[ii]) { - values[ii] = Sentinels.NULL_SHORT; - } - } - } - - @Override - protected void nullValuesToFlags(short[] values, boolean[] isNull, int size) { - for (int ii = 0; ii < size; ++ii) { - isNull[ii] = values[ii] == Sentinels.NULL_SHORT; - } - } - } - - private static final class MyIntSink extends MySourceAndSinkBase { - public MyIntSink() { - super(new TIntArrayList(), - (dest, from, to) -> dest.fill(from, to, 0), - TIntArrayList::set, - TIntArrayList::add, - (dest, name) -> Column.ofArray(name, dest.toArray()), - TIntArrayList::toArray); - } - - @Override - protected void nullFlagsToValues(boolean[] isNull, int[] values, int size) { - for (int ii = 0; ii < size; ++ii) { - if 
(isNull[ii]) { - values[ii] = Sentinels.NULL_INT; - } - } - } - - @Override - protected void nullValuesToFlags(int[] values, boolean[] isNull, int size) { - for (int ii = 0; ii < size; ++ii) { - isNull[ii] = values[ii] == Sentinels.NULL_INT; - } - } - } - - private static class MyLongSinkBase extends MySourceAndSinkBase { - private final long nullSentinel; - - public MyLongSinkBase(final long nullSentinel, final Class reinterpretedType) { - super(new TLongArrayList(), - (dest, from, to) -> dest.fill(from, to, 0L), - TLongArrayList::set, - TLongArrayList::add, - (dest, name) -> Column.ofArray(name, dest.toArray()).reinterpret(reinterpretedType), - TLongArrayList::toArray); - this.nullSentinel = nullSentinel; - } - - @Override - protected final void nullFlagsToValues(boolean[] isNull, long[] values, int size) { - for (int ii = 0; ii < size; ++ii) { - if (isNull[ii]) { - values[ii] = nullSentinel; - } - } - } - - @Override - protected final void nullValuesToFlags(long[] values, boolean[] isNull, int size) { - for (int ii = 0; ii < size; ++ii) { - isNull[ii] = values[ii] == nullSentinel; - } - } - } - - private static final class MyLongSink extends MyLongSinkBase { - public MyLongSink() { - super(Sentinels.NULL_LONG, long.class); - } - } - - private static final class MyFloatSink extends MySinkBase { - public MyFloatSink() { - super(new TFloatArrayList(), - (dest, from, to) -> dest.fill(from, to, 0), - TFloatArrayList::set, - TFloatArrayList::add, - (dest, name) -> Column.ofArray(name, dest.toArray())); - } - - @Override - protected void nullFlagsToValues(boolean[] isNull, float[] values, int size) { - for (int ii = 0; ii < size; ++ii) { - if (isNull[ii]) { - values[ii] = Sentinels.NULL_FLOAT; - } - } - } - } - - private static final class MyDoubleSink extends MySinkBase { - public MyDoubleSink() { - super(new TDoubleArrayList(), - (dest, from, to) -> dest.fill(from, to, 0), - TDoubleArrayList::set, - TDoubleArrayList::add, - (dest, name) -> Column.ofArray(name, dest.toArray())); - } - - @Override - protected void nullFlagsToValues(boolean[] isNull, double[] values, int size) { - for (int ii = 0; ii < size; ++ii) { - if (isNull[ii]) { - values[ii] = Sentinels.NULL_DOUBLE; - } - } - } - } - - private static final class MyBooleanAsByteSink extends MyByteSinkBase { - public MyBooleanAsByteSink() { - super(Sentinels.NULL_BOOLEAN_AS_BYTE, boolean.class); - } - } - - private static final class MyCharSink extends MySinkBase { - public MyCharSink() { - super(new TCharArrayList(), - (coll, from, to) -> coll.fill(from, to, (char) 0), - TCharArrayList::set, - TCharArrayList::add, - (dest, name) -> Column.ofArray(name, dest.toArray())); - } - - @Override - protected void nullFlagsToValues(boolean[] isNull, char[] values, int size) { - for (int ii = 0; ii < size; ++ii) { - if (isNull[ii]) { - values[ii] = Sentinels.NULL_CHAR; - } - } - } - } - - private static final class MyStringSink extends MySinkBase, String[]> { - public MyStringSink() { - super(new ArrayList<>(), - MyStringSink::fill, - MyStringSink::set, - MyStringSink::add, - (dest, name) -> Column.ofArray(name, dest.toArray(new String[0]))); - } - - @Override - protected void nullFlagsToValues(boolean[] isNull, String[] values, int size) { - for (int ii = 0; ii < size; ++ii) { - if (isNull[ii]) { - values[ii] = null; - } - } - } - - private static void fill(final ArrayList dest, final int from, final int to) { - for (int current = from; current != to; ++current) { - if (current < dest.size()) { - dest.set(current, null); - } else { - 
dest.add(null); - } - } - } - - private static void set(final ArrayList dest, final int destOffset, final String[] src, - final int srcOffset, final int size) { - for (int ii = 0; ii < size; ++ii) { - dest.set(destOffset + ii, src[srcOffset + ii]); - } - } - - private static void add(final ArrayList dest, final String[] src, final int srcOffset, - final int size) { - for (int ii = 0; ii < size; ++ii) { - dest.add(src[srcOffset + ii]); - } - } - } - - private static final class MyDateTimeAsLongSink extends MyLongSinkBase { - public MyDateTimeAsLongSink() { - super(Sentinels.NULL_DATETIME_AS_LONG, Instant.class); - } - } - - private static final class MyTimestampAsLongSink extends MyLongSinkBase { - public MyTimestampAsLongSink() { - super(Sentinels.NULL_TIMESTAMP_AS_LONG, Instant.class); - } - } - - private static SinkFactory makeMySinkFactory() { - return SinkFactory.of( - MyByteSink::new, Sentinels.NULL_BYTE, - MyShortSink::new, Sentinels.NULL_SHORT, - MyIntSink::new, Sentinels.NULL_INT, - MyLongSink::new, Sentinels.NULL_LONG, - MyFloatSink::new, Sentinels.NULL_FLOAT, - MyDoubleSink::new, Sentinels.NULL_DOUBLE, - MyBooleanAsByteSink::new, - MyCharSink::new, Sentinels.NULL_CHAR, - MyStringSink::new, null, - MyDateTimeAsLongSink::new, Sentinels.NULL_LONG, - MyTimestampAsLongSink::new, Sentinels.NULL_LONG); - } -} diff --git a/extensions/csv/src/test/java/io/deephaven/csv/CsvTestSuite.java b/extensions/csv/src/test/java/io/deephaven/csv/CsvTestSuite.java index 4e79f4098a2..5663dd058fb 100644 --- a/extensions/csv/src/test/java/io/deephaven/csv/CsvTestSuite.java +++ b/extensions/csv/src/test/java/io/deephaven/csv/CsvTestSuite.java @@ -4,7 +4,7 @@ import org.junit.runners.Suite; @RunWith(Suite.class) -@Suite.SuiteClasses({CsvReaderTest.class, +@Suite.SuiteClasses({ TestCsvTools.class, DeephavenCsvTest.class}) public class CsvTestSuite { diff --git a/extensions/csv/src/test/java/io/deephaven/csv/DeephavenCsvTest.java b/extensions/csv/src/test/java/io/deephaven/csv/DeephavenCsvTest.java index 9b31a79f9e7..f10cc0b6e0e 100644 --- a/extensions/csv/src/test/java/io/deephaven/csv/DeephavenCsvTest.java +++ b/extensions/csv/src/test/java/io/deephaven/csv/DeephavenCsvTest.java @@ -4,9 +4,12 @@ import io.deephaven.engine.table.Table; import io.deephaven.engine.util.TableTools; import io.deephaven.time.DateTime; +import org.apache.commons.io.input.ReaderInputStream; import org.assertj.core.api.Assertions; import org.junit.Test; +import java.io.StringReader; +import java.nio.charset.StandardCharsets; import java.time.LocalDateTime; import java.time.ZoneId; @@ -32,7 +35,9 @@ public void dateTimeCustomTimezone() throws CsvReaderException { } private static void invokeTest(String input, CsvSpecs specs, Table expected) throws CsvReaderException { - final Table actual = specs.parse(input); + final StringReader reader = new StringReader(input); + final ReaderInputStream inputStream = new ReaderInputStream(reader, StandardCharsets.UTF_8); + final Table actual = CsvTools.readCsv(inputStream, specs); final String differences = TableTools.diff(actual, expected, 25); Assertions.assertThat(differences).isEmpty(); } From f7dae459ab8a2c486b5829f013379e02d05029a9 Mon Sep 17 00:00:00 2001 From: Devin Smith Date: Tue, 1 Feb 2022 16:02:40 -0800 Subject: [PATCH 2/7] Add snapshot note --- build.gradle | 1 + 1 file changed, 1 insertion(+) diff --git a/build.gradle b/build.gradle index 037a7012834..c40eeac4d10 100644 --- a/build.gradle +++ b/build.gradle @@ -60,6 +60,7 @@ allprojects { includeGroup 'org.apache.kafka' } } + // 
TODO: don't check this in maven { url "https://s01.oss.sonatype.org/content/repositories/snapshots/" mavenContent { From b694dee08d2ded5e78cd63724d7320b6fd9103de Mon Sep 17 00:00:00 2001 From: Corey Kosak Date: Tue, 1 Feb 2022 20:08:16 -0500 Subject: [PATCH 3/7] Provide missing timezone functionality --- .../main/java/io/deephaven/csv/CsvTools.java | 111 +++++++++++++++++- 1 file changed, 108 insertions(+), 3 deletions(-) diff --git a/extensions/csv/src/main/java/io/deephaven/csv/CsvTools.java b/extensions/csv/src/main/java/io/deephaven/csv/CsvTools.java index 422a474f1b0..ee0a418d934 100644 --- a/extensions/csv/src/main/java/io/deephaven/csv/CsvTools.java +++ b/extensions/csv/src/main/java/io/deephaven/csv/CsvTools.java @@ -4,6 +4,7 @@ package io.deephaven.csv; +import gnu.trove.map.hash.TIntObjectHashMap; import io.deephaven.base.Procedure; import io.deephaven.chunk.ByteChunk; import io.deephaven.chunk.CharChunk; @@ -20,11 +21,16 @@ import io.deephaven.chunk.WritableLongChunk; import io.deephaven.chunk.WritableShortChunk; import io.deephaven.chunk.attributes.Values; +import io.deephaven.csv.containers.ByteSlice; import io.deephaven.csv.reading.CsvReader; import io.deephaven.csv.sinks.Sink; import io.deephaven.csv.sinks.SinkFactory; import io.deephaven.csv.sinks.Source; +import io.deephaven.csv.tokenization.RangeTests; +import io.deephaven.csv.tokenization.Tokenizer; import io.deephaven.csv.util.CsvReaderException; +import io.deephaven.csv.util.MutableLong; +import io.deephaven.csv.util.MutableObject; import io.deephaven.datastructures.util.CollectionUtil; import io.deephaven.engine.rowset.RowSequence; import io.deephaven.engine.rowset.RowSequenceFactory; @@ -73,6 +79,7 @@ import java.net.URL; import java.nio.file.Path; import java.nio.file.Paths; +import java.time.ZoneId; import java.util.Arrays; import java.util.Collection; import java.util.LinkedHashMap; @@ -192,13 +199,18 @@ public static Table readCsv(String path, CsvSpecs specs) throws CsvReaderExcepti * Creates an in-memory table from {@code stream} by importing CSV data according to the {@code specs}. The * {@code stream} will be closed upon return. * - * @param stream the stream - * @param specs the csv specs - * @return the table + * @param stream The stream + * @param specs The CSV specs. Note that as a special case, if the caller leaves the {@link CsvSpecs#customTimeZoneParser()} + * field unset, we will supply the {@link DeephavenTimeZoneParser} here. + * @return The table. * @throws CsvReaderException If some error occurs. */ @ScriptApi public static Table readCsv(InputStream stream, CsvSpecs specs) throws CsvReaderException { + // Provide the DeephavenTimeZoneParser if an alternate one was not already specified. 
+ if (specs.customTimeZoneParser() == null) { + specs = ImmutableCsvSpecs.builder().from(specs).customTimeZoneParser(new DeephavenTimeZoneParser()).build(); + } final CsvReader.Result result = CsvReader.read(specs, stream, makeMySinkFactory()); final String[] columnNames = result.columnNames(); final Sink[] sinks = result.columns(); @@ -1224,4 +1236,97 @@ private static SinkFactory makeMySinkFactory() { MyDateTimeAsLongSink::new, QueryConstants.NULL_LONG, MyDateTimeAsLongSink::new, QueryConstants.NULL_LONG); } + + private static final class DeephavenTimeZoneParser implements Tokenizer.CustomTimeZoneParser { + private static final String DEEPHAVEN_TZ_PREFIX = "TZ_"; + private static final int MAX_DEEPHAVEN_TZ_LENGTH = 3; + + private final TIntObjectHashMap zoneIdMap = new TIntObjectHashMap<>(); + + private int lastTzKey = -1; + private ZoneId lastZoneId = null; + + public DeephavenTimeZoneParser() { + for (TimeZone zone : TimeZone.values()) { + final String zname = zone.name(); + if (!zname.startsWith(DEEPHAVEN_TZ_PREFIX)) { + throw new RuntimeException("Logic error: unexpected enum in DBTimeZone: " + zname); + } + final String zSuffix = zname.substring(DEEPHAVEN_TZ_PREFIX.length()); + final int zlen = zSuffix.length(); + if (zlen > MAX_DEEPHAVEN_TZ_LENGTH) { + throw new RuntimeException("Logic error: unexpectedly-long enum in DBTimeZone: " + zname); + } + final byte[] data = new byte[zlen]; + for (int ii = 0; ii < zlen; ++ii) { + final char ch = zSuffix.charAt(ii); + if (!RangeTests.isUpper(ch)) { + throw new RuntimeException("Logic error: unexpected character in DBTimeZone name: " + zname); + } + data[ii] = (byte) ch; + } + final ByteSlice bs = new ByteSlice(data, 0, data.length); + final int tzKey = tryParseTzKey(bs); + if (tzKey < 0) { + throw new RuntimeException("Logic error: can't parse DBTimeZone as key: " + zname); + } + final ZoneId zoneId = zone.getTimeZone().toTimeZone().toZoneId(); + zoneIdMap.put(tzKey, zoneId); + } + } + + @Override + public boolean tryParse(ByteSlice bs, MutableObject zoneId, MutableLong offsetSeconds) { + if (bs.size() == 0 || bs.front() != ' ') { + return false; + } + final int savedBegin = bs.begin(); + bs.setBegin(bs.begin() + 1); + final int tzKey = tryParseTzKey(bs); + if (tzKey < 0) { + bs.setBegin(savedBegin); + return false; + } + if (tzKey != lastTzKey) { + final ZoneId res = zoneIdMap.get(tzKey); + if (res == null) { + bs.setBegin(savedBegin); + return false; + } + lastTzKey = tzKey; + lastZoneId = res; + } + zoneId.setValue(lastZoneId); + offsetSeconds.setValue(0); + return true; + } + + /** + * Take up to three uppercase characters from a TimeZone string and pack them into an integer. + * + * @param bs A ByteSlice holding the timezone key. + * @return The characters packed into an int, or -1 if there are too many or too few characters, or if the + * characters are not uppercase ASCII. 
+ */ + private static int tryParseTzKey(final ByteSlice bs) { + int res = 0; + int current; + for (current = bs.begin(); current != bs.end(); ++current) { + if (current - bs.begin() > MAX_DEEPHAVEN_TZ_LENGTH) { + return -1; + } + final char ch = RangeTests.toUpper((char) bs.data()[current]); + if (!RangeTests.isUpper(ch)) { + // If it's some nonalphabetic character + break; + } + res = res * 26 + (ch - 'A'); + } + if (current - bs.begin() == 0) { + return -1; + } + bs.setBegin(current); + return res; + } + } } From 57bb3f80317d459caf8afb8d0e7880f1fb9a13d5 Mon Sep 17 00:00:00 2001 From: Devin Smith Date: Mon, 7 Feb 2022 17:13:46 -0800 Subject: [PATCH 4/7] use deephaven-csv:0.1.0 --- build.gradle | 7 --- extensions/csv/build.gradle | 2 +- .../main/java/io/deephaven/csv/CsvTools.java | 49 +++++++++++-------- .../io/deephaven/csv/DeephavenCsvTest.java | 2 +- 4 files changed, 31 insertions(+), 29 deletions(-) diff --git a/build.gradle b/build.gradle index c40eeac4d10..1b44687f7c8 100644 --- a/build.gradle +++ b/build.gradle @@ -60,13 +60,6 @@ allprojects { includeGroup 'org.apache.kafka' } } - // TODO: don't check this in - maven { - url "https://s01.oss.sonatype.org/content/repositories/snapshots/" - mavenContent { - snapshotsOnly() - } - } } } diff --git a/extensions/csv/build.gradle b/extensions/csv/build.gradle index d76e66784b3..dac964473a0 100644 --- a/extensions/csv/build.gradle +++ b/extensions/csv/build.gradle @@ -6,7 +6,7 @@ description 'CSV: Support to read and write engine tables from/to CSV' dependencies { api project(':engine-api') - api 'io.deephaven:deephaven-csv:0.0.1-SNAPSHOT' + api 'io.deephaven:deephaven-csv:0.1.0' implementation project(':engine-table'), project(':engine-base') diff --git a/extensions/csv/src/main/java/io/deephaven/csv/CsvTools.java b/extensions/csv/src/main/java/io/deephaven/csv/CsvTools.java index ee0a418d934..cafce43c984 100644 --- a/extensions/csv/src/main/java/io/deephaven/csv/CsvTools.java +++ b/extensions/csv/src/main/java/io/deephaven/csv/CsvTools.java @@ -21,6 +21,7 @@ import io.deephaven.chunk.WritableLongChunk; import io.deephaven.chunk.WritableShortChunk; import io.deephaven.chunk.attributes.Values; +import io.deephaven.csv.CsvSpecs.Builder; import io.deephaven.csv.containers.ByteSlice; import io.deephaven.csv.reading.CsvReader; import io.deephaven.csv.sinks.Sink; @@ -28,6 +29,7 @@ import io.deephaven.csv.sinks.Source; import io.deephaven.csv.tokenization.RangeTests; import io.deephaven.csv.tokenization.Tokenizer; +import io.deephaven.csv.tokenization.Tokenizer.CustomTimeZoneParser; import io.deephaven.csv.util.CsvReaderException; import io.deephaven.csv.util.MutableLong; import io.deephaven.csv.util.MutableObject; @@ -97,6 +99,15 @@ public class CsvTools { public final static boolean NULLS_AS_EMPTY_DEFAULT = true; + /** + * Creates a {@link Builder} with {@link CsvTools}-specific values. Includes a {@link CustomTimeZoneParser}. + * + * @return the builder + */ + public static Builder builder() { + return CsvSpecs.builder().customTimeZoneParser(new DeephavenTimeZoneParser()); + } + /** * Creates an in-memory table from {@code path} by importing CSV data. 
* @@ -114,7 +125,7 @@ */ @ScriptApi public static Table readCsv(String path) throws CsvReaderException { - return readCsv(path, CsvSpecs.csv()); + return readCsv(path, builder().build()); } /** @@ -128,7 +139,7 @@ public static Table readCsv(String path) throws CsvReaderException { */ @ScriptApi public static Table readCsv(InputStream stream) throws CsvReaderException { - return readCsv(stream, CsvSpecs.csv()); + return readCsv(stream, builder().build()); } /** @@ -141,7 +152,7 @@ public static Table readCsv(InputStream stream) throws CsvReaderException { */ @ScriptApi public static Table readCsv(URL url) throws CsvReaderException { - return readCsv(url, CsvSpecs.csv()); + return readCsv(url, builder().build()); } /** @@ -158,7 +169,7 @@ public static Table readCsv(URL url) throws CsvReaderException { */ @ScriptApi public static Table readCsv(Path path) throws CsvReaderException { - return readCsv(path, CsvSpecs.csv()); + return readCsv(path, builder().build()); } /** @@ -200,17 +211,12 @@ public static Table readCsv(String path, CsvSpecs specs) throws CsvReaderExcepti * {@code stream} will be closed upon return. * * @param stream The stream - * @param specs The CSV specs. Note that as a special case, if the caller leaves the {@link CsvSpecs#customTimeZoneParser()} - * field unset, we will supply the {@link DeephavenTimeZoneParser} here. + * @param specs The CSV specs. * @return The table. * @throws CsvReaderException If some error occurs. */ @ScriptApi public static Table readCsv(InputStream stream, CsvSpecs specs) throws CsvReaderException { - // Provide the DeephavenTimeZoneParser if an alternate one was not already specified. - if (specs.customTimeZoneParser() == null) { - specs = ImmutableCsvSpecs.builder().from(specs).customTimeZoneParser(new DeephavenTimeZoneParser()).build(); - } final CsvReader.Result result = CsvReader.read(specs, stream, makeMySinkFactory()); final String[] columnNames = result.columnNames(); final Sink[] sinks = result.columns(); @@ -299,20 +305,22 @@ public static MatchPair[] renamesForHeaderless(String... columnNames) { /** * Equivalent to - * {@code CsvTools.readCsv(filePath, CsvSpecs.headerless()).renameColumns(renamesForHeaderless(columnNames));} + * {@code CsvTools.readCsv(filePath, CsvTools.builder().hasHeaderRow(false).build()).renameColumns(renamesForHeaderless(columnNames));} */ @ScriptApi public static Table readHeaderlessCsv(String filePath, Collection columnNames) throws CsvReaderException { - return CsvTools.readCsv(filePath, CsvSpecs.headerless()).renameColumns(renamesForHeaderless(columnNames)); + return CsvTools.readCsv(filePath, builder().hasHeaderRow(false).build()) + .renameColumns(renamesForHeaderless(columnNames)); } /** * Equivalent to - * {@code CsvTools.readCsv(filePath, CsvSpecs.headerless()).renameColumns(renamesForHeaderless(columnNames));} + * {@code CsvTools.readCsv(filePath, CsvTools.builder().hasHeaderRow(false).build()).renameColumns(renamesForHeaderless(columnNames));} */ @ScriptApi public static Table readHeaderlessCsv(String filePath, String... 
columnNames) throws CsvReaderException { - return CsvTools.readCsv(filePath, CsvSpecs.headerless()).renameColumns(renamesForHeaderless(columnNames)); + return CsvTools.readCsv(filePath, builder().hasHeaderRow(false).build()) + .renameColumns(renamesForHeaderless(columnNames)); } /** @@ -349,7 +357,7 @@ public static Table readCsv(InputStream is, final String format) throws CsvReade @ScriptApi @Deprecated public static Table readCsv(InputStream is, final char separator) throws CsvReaderException { - return readCsv(is, CsvSpecs.builder().delimiter(separator).build()); + return readCsv(is, builder().delimiter(separator).build()); } private static boolean isStandardFile(URL url) { @@ -951,16 +959,17 @@ private static void writeCsvContentsSeq( } public static CsvSpecs fromLegacyFormat(String format) { + final Builder builder = builder(); if (format == null) { - return CsvSpecs.csv(); + return builder.build(); } else if (format.length() == 1) { - return CsvSpecs.builder().delimiter(format.charAt(0)).build(); + return builder.delimiter(format.charAt(0)).build(); } else if ("TRIM".equals(format)) { - return CsvSpecs.builder().trim(true).build(); + return builder.trim(true).build(); } else if ("DEFAULT".equals(format)) { - return CsvSpecs.builder().ignoreSurroundingSpaces(false).build(); + return builder.ignoreSurroundingSpaces(false).build(); } else if ("TDF".equals(format)) { - return CsvSpecs.tsv(); + return builder.delimiter('\t').build(); } return null; } diff --git a/extensions/csv/src/test/java/io/deephaven/csv/DeephavenCsvTest.java b/extensions/csv/src/test/java/io/deephaven/csv/DeephavenCsvTest.java index f10cc0b6e0e..c75f401c777 100644 --- a/extensions/csv/src/test/java/io/deephaven/csv/DeephavenCsvTest.java +++ b/extensions/csv/src/test/java/io/deephaven/csv/DeephavenCsvTest.java @@ -31,7 +31,7 @@ public void dateTimeCustomTimezone() throws CsvReaderException { final Table expected = TableTools.newTable( TableTools.col("Timestamp", DATETIME_A, null, DATETIME_B)); - invokeTest(input, CsvSpecs.csv(), expected); + invokeTest(input, CsvTools.builder().build(), expected); } private static void invokeTest(String input, CsvSpecs specs, Table expected) throws CsvReaderException { From 6e734a95b4cc20452c6093c6141c02ceae382744 Mon Sep 17 00:00:00 2001 From: Devin Smith Date: Tue, 8 Feb 2022 08:02:57 -0800 Subject: [PATCH 5/7] Setup column legalizers. Extract time zone map into static. --- .../io/deephaven/csv/ColumnNameLegalizer.java | 37 ++++++ .../main/java/io/deephaven/csv/CsvTools.java | 111 ++---------------- .../csv/DeephavenTimeZoneParser.java | 110 +++++++++++++++++ 3 files changed, 157 insertions(+), 101 deletions(-) create mode 100644 extensions/csv/src/main/java/io/deephaven/csv/ColumnNameLegalizer.java create mode 100644 extensions/csv/src/main/java/io/deephaven/csv/DeephavenTimeZoneParser.java diff --git a/extensions/csv/src/main/java/io/deephaven/csv/ColumnNameLegalizer.java b/extensions/csv/src/main/java/io/deephaven/csv/ColumnNameLegalizer.java new file mode 100644 index 00000000000..b3433ce6340 --- /dev/null +++ b/extensions/csv/src/main/java/io/deephaven/csv/ColumnNameLegalizer.java @@ -0,0 +1,37 @@ +package io.deephaven.csv; + +import io.deephaven.api.util.NameValidator; +import io.deephaven.csv.CsvSpecs.Builder; + +import java.util.function.Function; +import java.util.function.Predicate; +import java.util.regex.Pattern; + +/** + * A {@link Builder#headerLegalizer(Function)} that replaces {@code '-'} and {@code ' '} with {@code '_'}. 
Also + * implements {@link Builder#headerValidator(Predicate)}. + */ +public enum ColumnNameLegalizer implements Function, Predicate { + INSTANCE; + + private final Pattern pattern; + + ColumnNameLegalizer() { + this.pattern = Pattern.compile("[- ]"); + } + + private String replace(String columnName) { + return pattern.matcher(columnName).replaceAll("_"); + } + + @Override + public String[] apply(String[] columnNames) { + return NameValidator.legalizeColumnNames(columnNames, this::replace, true); + } + + + @Override + public boolean test(String columnName) { + return NameValidator.isValidColumnName(columnName); + } +} diff --git a/extensions/csv/src/main/java/io/deephaven/csv/CsvTools.java b/extensions/csv/src/main/java/io/deephaven/csv/CsvTools.java index cafce43c984..f52e30ca59f 100644 --- a/extensions/csv/src/main/java/io/deephaven/csv/CsvTools.java +++ b/extensions/csv/src/main/java/io/deephaven/csv/CsvTools.java @@ -4,7 +4,7 @@ package io.deephaven.csv; -import gnu.trove.map.hash.TIntObjectHashMap; +import io.deephaven.api.util.NameValidator; import io.deephaven.base.Procedure; import io.deephaven.chunk.ByteChunk; import io.deephaven.chunk.CharChunk; @@ -22,17 +22,12 @@ import io.deephaven.chunk.WritableShortChunk; import io.deephaven.chunk.attributes.Values; import io.deephaven.csv.CsvSpecs.Builder; -import io.deephaven.csv.containers.ByteSlice; import io.deephaven.csv.reading.CsvReader; import io.deephaven.csv.sinks.Sink; import io.deephaven.csv.sinks.SinkFactory; import io.deephaven.csv.sinks.Source; -import io.deephaven.csv.tokenization.RangeTests; -import io.deephaven.csv.tokenization.Tokenizer; import io.deephaven.csv.tokenization.Tokenizer.CustomTimeZoneParser; import io.deephaven.csv.util.CsvReaderException; -import io.deephaven.csv.util.MutableLong; -import io.deephaven.csv.util.MutableObject; import io.deephaven.datastructures.util.CollectionUtil; import io.deephaven.engine.rowset.RowSequence; import io.deephaven.engine.rowset.RowSequenceFactory; @@ -81,12 +76,13 @@ import java.net.URL; import java.nio.file.Path; import java.nio.file.Paths; -import java.time.ZoneId; import java.util.Arrays; import java.util.Collection; import java.util.LinkedHashMap; import java.util.List; import java.util.Map; +import java.util.function.Function; +import java.util.function.Predicate; /** * Utilities for reading and writing CSV files to and from {@link Table}s @@ -100,12 +96,17 @@ public class CsvTools { public final static boolean NULLS_AS_EMPTY_DEFAULT = true; /** - * Creates a {@link Builder} with {@link CsvTools}-specific values. Includes a {@link CustomTimeZoneParser}. + * Creates a {@link Builder} with {@link CsvTools}-specific values. Sets {@link ColumnNameLegalizer#INSTANCE} as + * {@link Builder#headerLegalizer(Function)} and {@link Builder#headerValidator(Predicate)}; sets a new instance of + * {@link DeephavenTimeZoneParser} as {@link Builder#customTimeZoneParser(CustomTimeZoneParser)}. 
* * @return the builder */ public static Builder builder() { - return CsvSpecs.builder().customTimeZoneParser(new DeephavenTimeZoneParser()); + return CsvSpecs.builder() + .headerLegalizer(ColumnNameLegalizer.INSTANCE) + .headerValidator(ColumnNameLegalizer.INSTANCE) + .customTimeZoneParser(new DeephavenTimeZoneParser()); } /** @@ -1246,96 +1247,4 @@ private static SinkFactory makeMySinkFactory() { MyDateTimeAsLongSink::new, QueryConstants.NULL_LONG); } - private static final class DeephavenTimeZoneParser implements Tokenizer.CustomTimeZoneParser { - private static final String DEEPHAVEN_TZ_PREFIX = "TZ_"; - private static final int MAX_DEEPHAVEN_TZ_LENGTH = 3; - - private final TIntObjectHashMap zoneIdMap = new TIntObjectHashMap<>(); - - private int lastTzKey = -1; - private ZoneId lastZoneId = null; - - public DeephavenTimeZoneParser() { - for (TimeZone zone : TimeZone.values()) { - final String zname = zone.name(); - if (!zname.startsWith(DEEPHAVEN_TZ_PREFIX)) { - throw new RuntimeException("Logic error: unexpected enum in DBTimeZone: " + zname); - } - final String zSuffix = zname.substring(DEEPHAVEN_TZ_PREFIX.length()); - final int zlen = zSuffix.length(); - if (zlen > MAX_DEEPHAVEN_TZ_LENGTH) { - throw new RuntimeException("Logic error: unexpectedly-long enum in DBTimeZone: " + zname); - } - final byte[] data = new byte[zlen]; - for (int ii = 0; ii < zlen; ++ii) { - final char ch = zSuffix.charAt(ii); - if (!RangeTests.isUpper(ch)) { - throw new RuntimeException("Logic error: unexpected character in DBTimeZone name: " + zname); - } - data[ii] = (byte) ch; - } - final ByteSlice bs = new ByteSlice(data, 0, data.length); - final int tzKey = tryParseTzKey(bs); - if (tzKey < 0) { - throw new RuntimeException("Logic error: can't parse DBTimeZone as key: " + zname); - } - final ZoneId zoneId = zone.getTimeZone().toTimeZone().toZoneId(); - zoneIdMap.put(tzKey, zoneId); - } - } - - @Override - public boolean tryParse(ByteSlice bs, MutableObject zoneId, MutableLong offsetSeconds) { - if (bs.size() == 0 || bs.front() != ' ') { - return false; - } - final int savedBegin = bs.begin(); - bs.setBegin(bs.begin() + 1); - final int tzKey = tryParseTzKey(bs); - if (tzKey < 0) { - bs.setBegin(savedBegin); - return false; - } - if (tzKey != lastTzKey) { - final ZoneId res = zoneIdMap.get(tzKey); - if (res == null) { - bs.setBegin(savedBegin); - return false; - } - lastTzKey = tzKey; - lastZoneId = res; - } - zoneId.setValue(lastZoneId); - offsetSeconds.setValue(0); - return true; - } - - /** - * Take up to three uppercase characters from a TimeZone string and pack them into an integer. - * - * @param bs A ByteSlice holding the timezone key. - * @return The characters packed into an int, or -1 if there are too many or too few characters, or if the - * characters are not uppercase ASCII. 
- */ - private static int tryParseTzKey(final ByteSlice bs) { - int res = 0; - int current; - for (current = bs.begin(); current != bs.end(); ++current) { - if (current - bs.begin() > MAX_DEEPHAVEN_TZ_LENGTH) { - return -1; - } - final char ch = RangeTests.toUpper((char) bs.data()[current]); - if (!RangeTests.isUpper(ch)) { - // If it's some nonalphabetic character - break; - } - res = res * 26 + (ch - 'A'); - } - if (current - bs.begin() == 0) { - return -1; - } - bs.setBegin(current); - return res; - } - } } diff --git a/extensions/csv/src/main/java/io/deephaven/csv/DeephavenTimeZoneParser.java b/extensions/csv/src/main/java/io/deephaven/csv/DeephavenTimeZoneParser.java new file mode 100644 index 00000000000..699a0c95018 --- /dev/null +++ b/extensions/csv/src/main/java/io/deephaven/csv/DeephavenTimeZoneParser.java @@ -0,0 +1,110 @@ +package io.deephaven.csv; + +import gnu.trove.map.hash.TIntObjectHashMap; +import io.deephaven.csv.containers.ByteSlice; +import io.deephaven.csv.tokenization.RangeTests; +import io.deephaven.csv.tokenization.Tokenizer; +import io.deephaven.csv.util.MutableLong; +import io.deephaven.csv.util.MutableObject; +import io.deephaven.time.TimeZone; + +import java.time.ZoneId; + + +public final class DeephavenTimeZoneParser implements Tokenizer.CustomTimeZoneParser { + private static final String DEEPHAVEN_TZ_PREFIX = "TZ_"; + private static final int MAX_DEEPHAVEN_TZ_LENGTH = 3; + private static final TIntObjectHashMap ZONE_ID_MAP = createZoneIdMap(); + + private int lastTzKey = -1; + private ZoneId lastZoneId = null; + + public DeephavenTimeZoneParser() { + + } + + @Override + public boolean tryParse(ByteSlice bs, MutableObject zoneId, MutableLong offsetSeconds) { + if (bs.size() == 0 || bs.front() != ' ') { + return false; + } + final int savedBegin = bs.begin(); + bs.setBegin(bs.begin() + 1); + final int tzKey = tryParseTzKey(bs); + if (tzKey < 0) { + bs.setBegin(savedBegin); + return false; + } + if (tzKey != lastTzKey) { + final ZoneId res = ZONE_ID_MAP.get(tzKey); + if (res == null) { + bs.setBegin(savedBegin); + return false; + } + lastTzKey = tzKey; + lastZoneId = res; + } + zoneId.setValue(lastZoneId); + offsetSeconds.setValue(0); + return true; + } + + /** + * Take up to three uppercase characters from a TimeZone string and pack them into an integer. + * + * @param bs A ByteSlice holding the timezone key. + * @return The characters packed into an int, or -1 if there are too many or too few characters, or if the + * characters are not uppercase ASCII. 
+ */ + private static int tryParseTzKey(final ByteSlice bs) { + int res = 0; + int current; + for (current = bs.begin(); current != bs.end(); ++current) { + if (current - bs.begin() > MAX_DEEPHAVEN_TZ_LENGTH) { + return -1; + } + final char ch = RangeTests.toUpper((char) bs.data()[current]); + if (!RangeTests.isUpper(ch)) { + // If it's some nonalphabetic character + break; + } + res = res * 26 + (ch - 'A'); + } + if (current - bs.begin() == 0) { + return -1; + } + bs.setBegin(current); + return res; + } + + private static TIntObjectHashMap createZoneIdMap() { + final TIntObjectHashMap zoneIdMap = new TIntObjectHashMap<>(); + for (TimeZone zone : TimeZone.values()) { + final String zname = zone.name(); + if (!zname.startsWith(DEEPHAVEN_TZ_PREFIX)) { + throw new RuntimeException("Logic error: unexpected enum in DBTimeZone: " + zname); + } + final String zSuffix = zname.substring(DEEPHAVEN_TZ_PREFIX.length()); + final int zlen = zSuffix.length(); + if (zlen > MAX_DEEPHAVEN_TZ_LENGTH) { + throw new RuntimeException("Logic error: unexpectedly-long enum in DBTimeZone: " + zname); + } + final byte[] data = new byte[zlen]; + for (int ii = 0; ii < zlen; ++ii) { + final char ch = zSuffix.charAt(ii); + if (!RangeTests.isUpper(ch)) { + throw new RuntimeException("Logic error: unexpected character in DBTimeZone name: " + zname); + } + data[ii] = (byte) ch; + } + final ByteSlice bs = new ByteSlice(data, 0, data.length); + final int tzKey = tryParseTzKey(bs); + if (tzKey < 0) { + throw new RuntimeException("Logic error: can't parse DBTimeZone as key: " + zname); + } + final ZoneId zoneId = zone.getTimeZone().toTimeZone().toZoneId(); + zoneIdMap.put(tzKey, zoneId); + } + return zoneIdMap; + } +} From 3d9b1bae746e5eb2561e078cf38a20484ca8a609 Mon Sep 17 00:00:00 2001 From: Devin Smith Date: Tue, 8 Feb 2022 10:54:10 -0800 Subject: [PATCH 6/7] Update python csv to use new deephaven-csv --- Integrations/python/deephaven/csv.py | 89 +++++++++------------------- pyintegration/deephaven2/csv.py | 72 ++++++++-------------- 2 files changed, 51 insertions(+), 110 deletions(-) diff --git a/Integrations/python/deephaven/csv.py b/Integrations/python/deephaven/csv.py index c770beb544c..bc884858d76 100644 --- a/Integrations/python/deephaven/csv.py +++ b/Integrations/python/deephaven/csv.py @@ -10,38 +10,13 @@ import jpy import wrapt -from deephaven.Types import DataType +import deephaven.Types as dh _JCsvHelpers = None -_JCsvSpecs = None -_JInferenceSpecs = None _JTableHeader = None _JCsvTools = None - - -INFERENCE_STRINGS = None -""" The order of parsing: STRING, INSTANT, SHORT, INT, LONG, DOUBLE, BOOL, CHAR, BYTE, FLOAT. -The parsers after STRING are only relevant when a specific column data type is given. -""" - -INFERENCE_MINIMAL = None -""" The order of parsing: INSTANT, LONG, DOUBLE, BOOL, STRING, BYTE, SHORT, INT, FLOAT, CHAR. -The parsers after STRING are only relevant when a specific column data type is given. -""" - -INFERENCE_STANDARD = None -""" The order of parsing: INSTANT, SHORT, INT, LONG, DOUBLE, BOOL, CHAR, STRING, BYTE, FLOAT. -The parsers after STRING are only relevant when a specific column data type is given. -""" - -INFERENCE_STANDARD_TIMES = None -""" The order of parsing: INSTANT, INSTANT_LEGACY, SECONDS, MILLISECONDS, MICROSECONDS, NANOSECONDS, SHORT, INT, -LONG, DOUBLE, BOOL, CHAR, STRING, BYTE, FLOAT. - -For values that can be parsed as SECONDS/MILLISECONDS/MICROSECONDS/NANOSECONDS, they must be within the 21 century. 
- -The parsers after STRING are only relevant when a specific column data type is given. -""" +_JParsers = None +_JArrays = None def _defineSymbols(): @@ -54,21 +29,16 @@ def _defineSymbols(): if not jpy.has_jvm(): raise SystemError("No java functionality can be used until the JVM has been initialized through the jpy module") - global _JCsvHelpers, _JCsvSpecs, _JInferenceSpecs, _JTableHeader, _JCsvTools, \ - INFERENCE_STRINGS, INFERENCE_MINIMAL, INFERENCE_STANDARD, INFERENCE_STANDARD_TIMES + global _JCsvHelpers, _JTableHeader, _JCsvTools, _JParsers, _JArrays if _JCsvHelpers is None: # This will raise an exception if the desired object is not the classpath _JCsvHelpers = jpy.get_type("io.deephaven.csv.CsvTools") - _JCsvSpecs = jpy.get_type("io.deephaven.csv.CsvSpecs") - _JInferenceSpecs = jpy.get_type("io.deephaven.csv.InferenceSpecs") _JTableHeader = jpy.get_type("io.deephaven.qst.table.TableHeader") _JCsvTools = jpy.get_type("io.deephaven.csv.CsvTools") + _JParsers = jpy.get_type("io.deephaven.csv.parsers.Parsers") + _JArrays = jpy.get_type("java.util.Arrays") - INFERENCE_STRINGS = _JInferenceSpecs.strings() - INFERENCE_MINIMAL = _JInferenceSpecs.minimal() - INFERENCE_STANDARD = _JInferenceSpecs.standard() - INFERENCE_STANDARD_TIMES = _JInferenceSpecs.standardTimes() # every module method should be decorated with @_passThrough @wrapt.decorator @@ -87,22 +57,9 @@ def _passThrough(wrapped, instance, args, kwargs): return wrapped(*args, **kwargs) -@_passThrough -def _build_header(header: Dict[str, DataType] = None): - if not header: - return None - - table_header_builder = _JTableHeader.builder() - for k, v in header.items(): - table_header_builder.putHeaders(k, v) - - return table_header_builder.build() - - @_passThrough def read(path: str, - header: Dict[str, DataType] = None, - inference: Any = None, + header: Dict[str, dh.DataType] = None, headless: bool = False, delimiter: str = ",", quote: str = "\"", @@ -114,7 +71,6 @@ def read(path: str, Args: path (str): a file path or a URL string header (Dict[str, DataType]): a dict to define the table columns with key being the name, value being the data type - inference (csv.Inference): an Enum value specifying the rules for data type inference, default is INFERENCE_STANDARD headless (bool): indicates if the CSV data is headless, default is False delimiter (str): the delimiter used by the CSV, default is the comma quote (str): the quote character for the CSV, default is double quote @@ -130,17 +86,26 @@ def read(path: str, Exception """ - if inference is None: - inference = INFERENCE_STANDARD - - csv_specs_builder = _JCsvSpecs.builder() - - # build the head spec - table_header = _build_header(header) - if table_header: - csv_specs_builder.header(table_header) - - csv_specs = (csv_specs_builder.inference(inference) + csv_specs_builder = _JCsvTools.builder() + + if header: + csv_specs_builder.headers(_JArrays.asList(list(header.keys()))) + parser_map = { + dh.bool_ : _JParsers.BOOLEAN, + dh.byte : _JParsers.BYTE, + dh.char : _JParsers.CHAR, + dh.short : _JParsers.SHORT, + dh.int_ : _JParsers.INT, + dh.long_ : _JParsers.LONG, + dh.float_ : _JParsers.FLOAT_FAST, + dh.double : _JParsers.DOUBLE, + dh.string : _JParsers.STRING, + dh.datetime : _JParsers.DATETIME + } + for column_name, column_type in header.items(): + csv_specs_builder.putParserForName(column_name, parser_map[column_type]) + + csv_specs = (csv_specs_builder .hasHeaderRow(not headless) .delimiter(ord(delimiter)) .quote(ord(quote)) diff --git a/pyintegration/deephaven2/csv.py 
b/pyintegration/deephaven2/csv.py index 44854e153ba..e5f0e3c6393 100644 --- a/pyintegration/deephaven2/csv.py +++ b/pyintegration/deephaven2/csv.py @@ -10,51 +10,16 @@ import jpy from deephaven2 import DHError -from deephaven2.dtypes import DType +import deephaven2.dtypes as dh from deephaven2.table import Table -_JCsvSpecs = jpy.get_type("io.deephaven.csv.CsvSpecs") -_JInferenceSpecs = jpy.get_type("io.deephaven.csv.InferenceSpecs") -_JTableHeader = jpy.get_type("io.deephaven.qst.table.TableHeader") _JCsvTools = jpy.get_type("io.deephaven.csv.CsvTools") - - -class Inference(Enum): - """ An Enum of predefined inference specs. - - Inference specifications contains the configuration and logic for inferring an acceptable parser from string values. - """ - - STRINGS = _JInferenceSpecs.strings() - """ Configured parsers: strings only. - """ - - MINIMAL = _JInferenceSpecs.minimal() - """ Configured parsers: BOOL, LONG, DOUBLE, INSTANT, STRING. - """ - - STANDARD = _JInferenceSpecs.standard() - """ Configured parsers: BOOL, INT, LONG, DOUBLE, DATETIME, CHAR, STRING. - """ - - STANDARD_TIMES = _JInferenceSpecs.standardTimes() - """ Configured parsers: BOOL, DATETIME, CHAR, STRING, SECONDS. - """ - -def _build_header(header: Dict[str, DType] = None): - if not header: - return None - - table_header_builder = _JTableHeader.builder() - for k, v in header.items(): - table_header_builder.putHeaders(k, v.qst_type) - - return table_header_builder.build() +_JParsers = jpy.get_type("io.deephaven.csv.parsers.Parsers") +_JArrays = jpy.get_type("java.util.Arrays") def read(path: str, - header: Dict[str, DType] = None, - inference: Any = Inference.STANDARD, + header: Dict[str, dh.DType] = None, headless: bool = False, delimiter: str = ",", quote: str = "\"", @@ -65,7 +30,6 @@ def read(path: str, Args: path (str): a file path or a URL string header (Dict[str, DType]): a dict to define the table columns with key being the name, value being the data type - inference (csv.Inference): an Enum value specifying the rules for data type inference, default is STANDARD_TIMES headless (bool): indicates if the CSV data is headless, default is False delimiter (str): the delimiter used by the CSV, default is the comma quote (str): the quote character for the CSV, default is double quote @@ -80,14 +44,26 @@ def read(path: str, DHError """ try: - csv_specs_builder = _JCsvSpecs.builder() - - # build the head spec - table_header = _build_header(header) - if table_header: - csv_specs_builder.header(table_header) - - csv_specs = (csv_specs_builder.inference(inference.value) + csv_specs_builder = _JCsvTools.builder() + + if header: + csv_specs_builder.headers(_JArrays.asList(list(header.keys()))) + parser_map = { + dh.bool_ : _JParsers.BOOLEAN, + dh.byte : _JParsers.BYTE, + dh.char : _JParsers.CHAR, + dh.short : _JParsers.SHORT, + dh.int_ : _JParsers.INT, + dh.long : _JParsers.LONG, + dh.float_ : _JParsers.FLOAT_FAST, + dh.double : _JParsers.DOUBLE, + dh.string : _JParsers.STRING, + dh.DateTime : _JParsers.DATETIME + } + for column_name, column_type in header.items(): + csv_specs_builder.putParserForName(column_name, parser_map[column_type]) + + csv_specs = (csv_specs_builder .hasHeaderRow(not headless) .delimiter(ord(delimiter)) .quote(ord(quote)) From 2c9d49f18530c593f5d28a4d2713b4e8a640c36d Mon Sep 17 00:00:00 2001 From: Devin Smith Date: Tue, 8 Feb 2022 10:57:48 -0800 Subject: [PATCH 7/7] change to dht --- Integrations/python/deephaven/csv.py | 24 ++++++++++++------------ pyintegration/deephaven2/csv.py | 24 
++++++++++++------------ 2 files changed, 24 insertions(+), 24 deletions(-) diff --git a/Integrations/python/deephaven/csv.py b/Integrations/python/deephaven/csv.py index bc884858d76..acf01578afa 100644 --- a/Integrations/python/deephaven/csv.py +++ b/Integrations/python/deephaven/csv.py @@ -10,7 +10,7 @@ import jpy import wrapt -import deephaven.Types as dh +import deephaven.Types as dht _JCsvHelpers = None _JTableHeader = None @@ -59,7 +59,7 @@ def _passThrough(wrapped, instance, args, kwargs): @_passThrough def read(path: str, - header: Dict[str, dh.DataType] = None, + header: Dict[str, dht.DataType] = None, headless: bool = False, delimiter: str = ",", quote: str = "\"", @@ -91,16 +91,16 @@ def read(path: str, if header: csv_specs_builder.headers(_JArrays.asList(list(header.keys()))) parser_map = { - dh.bool_ : _JParsers.BOOLEAN, - dh.byte : _JParsers.BYTE, - dh.char : _JParsers.CHAR, - dh.short : _JParsers.SHORT, - dh.int_ : _JParsers.INT, - dh.long_ : _JParsers.LONG, - dh.float_ : _JParsers.FLOAT_FAST, - dh.double : _JParsers.DOUBLE, - dh.string : _JParsers.STRING, - dh.datetime : _JParsers.DATETIME + dht.bool_ : _JParsers.BOOLEAN, + dht.byte : _JParsers.BYTE, + dht.char : _JParsers.CHAR, + dht.short : _JParsers.SHORT, + dht.int_ : _JParsers.INT, + dht.long_ : _JParsers.LONG, + dht.float_ : _JParsers.FLOAT_FAST, + dht.double : _JParsers.DOUBLE, + dht.string : _JParsers.STRING, + dht.datetime : _JParsers.DATETIME } for column_name, column_type in header.items(): csv_specs_builder.putParserForName(column_name, parser_map[column_type]) diff --git a/pyintegration/deephaven2/csv.py b/pyintegration/deephaven2/csv.py index e5f0e3c6393..a1eb5463148 100644 --- a/pyintegration/deephaven2/csv.py +++ b/pyintegration/deephaven2/csv.py @@ -10,7 +10,7 @@ import jpy from deephaven2 import DHError -import deephaven2.dtypes as dh +import deephaven2.dtypes as dht from deephaven2.table import Table _JCsvTools = jpy.get_type("io.deephaven.csv.CsvTools") @@ -19,7 +19,7 @@ def read(path: str, - header: Dict[str, dh.DType] = None, + header: Dict[str, dht.DType] = None, headless: bool = False, delimiter: str = ",", quote: str = "\"", @@ -49,16 +49,16 @@ def read(path: str, if header: csv_specs_builder.headers(_JArrays.asList(list(header.keys()))) parser_map = { - dh.bool_ : _JParsers.BOOLEAN, - dh.byte : _JParsers.BYTE, - dh.char : _JParsers.CHAR, - dh.short : _JParsers.SHORT, - dh.int_ : _JParsers.INT, - dh.long : _JParsers.LONG, - dh.float_ : _JParsers.FLOAT_FAST, - dh.double : _JParsers.DOUBLE, - dh.string : _JParsers.STRING, - dh.DateTime : _JParsers.DATETIME + dht.bool_ : _JParsers.BOOLEAN, + dht.byte : _JParsers.BYTE, + dht.char : _JParsers.CHAR, + dht.short : _JParsers.SHORT, + dht.int_ : _JParsers.INT, + dht.long : _JParsers.LONG, + dht.float_ : _JParsers.FLOAT_FAST, + dht.double : _JParsers.DOUBLE, + dht.string : _JParsers.STRING, + dht.DateTime : _JParsers.DATETIME } for column_name, column_type in header.items(): csv_specs_builder.putParserForName(column_name, parser_map[column_type])
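
Taken together, the Python-side changes in patches 6 and 7 swap inference specs for an explicit header dict: its keys become the column names and its values select a deephaven-csv parser through putParserForName. A minimal usage sketch of the reworked wrapper follows; the file path and column names are hypothetical, and it assumes a running Deephaven session so the deephaven package and its JVM bridge are importable:

    from deephaven import csv
    import deephaven.Types as dht

    # Headless file: the header dict supplies the column names, and each value
    # picks the parser via the map above (dht.string -> Parsers.STRING,
    # dht.double -> Parsers.DOUBLE, dht.datetime -> Parsers.DATETIME).
    quotes = csv.read(
        "/data/quotes.csv",  # hypothetical path
        header={"Sym": dht.string, "Price": dht.double, "Ts": dht.datetime},
        headless=True,
    )

If header is omitted, no parsers are pinned and deephaven-csv falls back to its standard type inference.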