Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Use deephaven-csv #1920

Merged
merged 8 commits into from
Feb 8, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
89 changes: 27 additions & 62 deletions Integrations/python/deephaven/csv.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,38 +10,13 @@
import jpy
import wrapt

from deephaven.Types import DataType
import deephaven.Types as dht

_JCsvHelpers = None
_JCsvSpecs = None
_JInferenceSpecs = None
_JTableHeader = None
_JCsvTools = None


INFERENCE_STRINGS = None
""" The order of parsing: STRING, INSTANT, SHORT, INT, LONG, DOUBLE, BOOL, CHAR, BYTE, FLOAT.
The parsers after STRING are only relevant when a specific column data type is given.
"""

INFERENCE_MINIMAL = None
""" The order of parsing: INSTANT, LONG, DOUBLE, BOOL, STRING, BYTE, SHORT, INT, FLOAT, CHAR.
The parsers after STRING are only relevant when a specific column data type is given.
"""

INFERENCE_STANDARD = None
""" The order of parsing: INSTANT, SHORT, INT, LONG, DOUBLE, BOOL, CHAR, STRING, BYTE, FLOAT.
The parsers after STRING are only relevant when a specific column data type is given.
"""

INFERENCE_STANDARD_TIMES = None
""" The order of parsing: INSTANT, INSTANT_LEGACY, SECONDS, MILLISECONDS, MICROSECONDS, NANOSECONDS, SHORT, INT,
LONG, DOUBLE, BOOL, CHAR, STRING, BYTE, FLOAT.

For values that can be parsed as SECONDS/MILLISECONDS/MICROSECONDS/NANOSECONDS, they must be within the 21st century.

The parsers after STRING are only relevant when a specific column data type is given.
"""
_JParsers = None
_JArrays = None


def _defineSymbols():
Expand All @@ -54,21 +29,16 @@ def _defineSymbols():
if not jpy.has_jvm():
raise SystemError("No java functionality can be used until the JVM has been initialized through the jpy module")

global _JCsvHelpers, _JCsvSpecs, _JInferenceSpecs, _JTableHeader, _JCsvTools, \
INFERENCE_STRINGS, INFERENCE_MINIMAL, INFERENCE_STANDARD, INFERENCE_STANDARD_TIMES
global _JCsvHelpers, _JTableHeader, _JCsvTools, _JParsers, _JArrays

if _JCsvHelpers is None:
# This will raise an exception if the desired object is not on the classpath
_JCsvHelpers = jpy.get_type("io.deephaven.csv.CsvTools")
_JCsvSpecs = jpy.get_type("io.deephaven.csv.CsvSpecs")
_JInferenceSpecs = jpy.get_type("io.deephaven.csv.InferenceSpecs")
_JTableHeader = jpy.get_type("io.deephaven.qst.table.TableHeader")
_JCsvTools = jpy.get_type("io.deephaven.csv.CsvTools")
_JParsers = jpy.get_type("io.deephaven.csv.parsers.Parsers")
_JArrays = jpy.get_type("java.util.Arrays")

INFERENCE_STRINGS = _JInferenceSpecs.strings()
INFERENCE_MINIMAL = _JInferenceSpecs.minimal()
INFERENCE_STANDARD = _JInferenceSpecs.standard()
INFERENCE_STANDARD_TIMES = _JInferenceSpecs.standardTimes()

# every module method should be decorated with @_passThrough
@wrapt.decorator
Expand All @@ -87,22 +57,9 @@ def _passThrough(wrapped, instance, args, kwargs):
return wrapped(*args, **kwargs)


@_passThrough
def _build_header(header: Dict[str, DataType] = None):
if not header:
return None

table_header_builder = _JTableHeader.builder()
for k, v in header.items():
table_header_builder.putHeaders(k, v)

return table_header_builder.build()


@_passThrough
def read(path: str,
header: Dict[str, DataType] = None,
inference: Any = None,
header: Dict[str, dht.DataType] = None,
headless: bool = False,
delimiter: str = ",",
quote: str = "\"",
Expand All @@ -114,7 +71,6 @@ def read(path: str,
Args:
path (str): a file path or a URL string
header (Dict[str, DataType]): a dict to define the table columns with key being the name, value being the data type
inference (csv.Inference): an Enum value specifying the rules for data type inference, default is INFERENCE_STANDARD
headless (bool): indicates if the CSV data is headless, default is False
delimiter (str): the delimiter used by the CSV, default is the comma
quote (str): the quote character for the CSV, default is double quote
Expand All @@ -130,17 +86,26 @@ def read(path: str,
Exception
"""

if inference is None:
inference = INFERENCE_STANDARD

csv_specs_builder = _JCsvSpecs.builder()

# build the head spec
table_header = _build_header(header)
if table_header:
csv_specs_builder.header(table_header)

csv_specs = (csv_specs_builder.inference(inference)
csv_specs_builder = _JCsvTools.builder()

if header:
csv_specs_builder.headers(_JArrays.asList(list(header.keys())))
parser_map = {
dht.bool_ : _JParsers.BOOLEAN,
dht.byte : _JParsers.BYTE,
dht.char : _JParsers.CHAR,
dht.short : _JParsers.SHORT,
dht.int_ : _JParsers.INT,
dht.long_ : _JParsers.LONG,
dht.float_ : _JParsers.FLOAT_FAST,
dht.double : _JParsers.DOUBLE,
dht.string : _JParsers.STRING,
dht.datetime : _JParsers.DATETIME
}
for column_name, column_type in header.items():
csv_specs_builder.putParserForName(column_name, parser_map[column_type])

csv_specs = (csv_specs_builder
.hasHeaderRow(not headless)
.delimiter(ord(delimiter))
.quote(ord(quote))
Expand Down
4 changes: 2 additions & 2 deletions extensions/csv/build.gradle
Original file line number Diff line number Diff line change
Expand Up @@ -6,10 +6,10 @@ description 'CSV: Support to read and write engine tables from/to CSV'

dependencies {
api project(':engine-api')
api 'io.deephaven:deephaven-csv:0.1.0'

implementation project(':engine-table'),
project(':engine-base'),
'ch.randelshofer:fastdoubleparser:0.3.0'
project(':engine-base')

Classpaths.inheritImmutables(project)

Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
package io.deephaven.csv;

import io.deephaven.api.util.NameValidator;
import io.deephaven.csv.CsvSpecs.Builder;

import java.util.function.Function;
import java.util.function.Predicate;
import java.util.regex.Pattern;

/**
 * A {@link Builder#headerLegalizer(Function)} that replaces {@code '-'} and {@code ' '} with {@code '_'}. Also
 * implements {@link Builder#headerValidator(Predicate)}.
 */
public enum ColumnNameLegalizer implements Function<String[], String[]>, Predicate<String> {
    INSTANCE;

    // Characters that get rewritten to '_' during legalization; compiled once for the singleton.
    private static final Pattern ILLEGAL_CHARS = Pattern.compile("[- ]");

    /** Rewrite every {@code '-'} or {@code ' '} in {@code columnName} to {@code '_'}. */
    private String legalize(String columnName) {
        return ILLEGAL_CHARS.matcher(columnName).replaceAll("_");
    }

    /**
     * Legalize each of {@code columnNames}, delegating the actual validation and renaming to
     * {@link NameValidator#legalizeColumnNames}.
     */
    @Override
    public String[] apply(String[] columnNames) {
        return NameValidator.legalizeColumnNames(columnNames, this::legalize, true);
    }

    /** Return whether {@code columnName} is already a valid column name. */
    @Override
    public boolean test(String columnName) {
        return NameValidator.isValidColumnName(columnName);
    }
}
Loading