Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Use deephaven-csv #1920

Merged
merged 8 commits into from
Feb 8, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
89 changes: 27 additions & 62 deletions Integrations/python/deephaven/csv.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,38 +10,13 @@
import jpy
import wrapt

from deephaven.Types import DataType
import deephaven.Types as dht

_JCsvHelpers = None
_JCsvSpecs = None
_JInferenceSpecs = None
_JTableHeader = None
_JCsvTools = None


INFERENCE_STRINGS = None
""" The order of parsing: STRING, INSTANT, SHORT, INT, LONG, DOUBLE, BOOL, CHAR, BYTE, FLOAT.
The parsers after STRING are only relevant when a specific column data type is given.
"""

INFERENCE_MINIMAL = None
""" The order of parsing: INSTANT, LONG, DOUBLE, BOOL, STRING, BYTE, SHORT, INT, FLOAT, CHAR.
The parsers after STRING are only relevant when a specific column data type is given.
"""

INFERENCE_STANDARD = None
""" The order of parsing: INSTANT, SHORT, INT, LONG, DOUBLE, BOOL, CHAR, STRING, BYTE, FLOAT.
The parsers after STRING are only relevant when a specific column data type is given.
"""

INFERENCE_STANDARD_TIMES = None
""" The order of parsing: INSTANT, INSTANT_LEGACY, SECONDS, MILLISECONDS, MICROSECONDS, NANOSECONDS, SHORT, INT,
LONG, DOUBLE, BOOL, CHAR, STRING, BYTE, FLOAT.

For values that can be parsed as SECONDS/MILLISECONDS/MICROSECONDS/NANOSECONDS, they must be within the 21st century.

The parsers after STRING are only relevant when a specific column data type is given.
"""
_JParsers = None
_JArrays = None


def _defineSymbols():
Expand All @@ -54,21 +29,16 @@ def _defineSymbols():
if not jpy.has_jvm():
raise SystemError("No java functionality can be used until the JVM has been initialized through the jpy module")

global _JCsvHelpers, _JCsvSpecs, _JInferenceSpecs, _JTableHeader, _JCsvTools, \
INFERENCE_STRINGS, INFERENCE_MINIMAL, INFERENCE_STANDARD, INFERENCE_STANDARD_TIMES
global _JCsvHelpers, _JTableHeader, _JCsvTools, _JParsers, _JArrays

if _JCsvHelpers is None:
# This will raise an exception if the desired object is not on the classpath
_JCsvHelpers = jpy.get_type("io.deephaven.csv.CsvTools")
_JCsvSpecs = jpy.get_type("io.deephaven.csv.CsvSpecs")
_JInferenceSpecs = jpy.get_type("io.deephaven.csv.InferenceSpecs")
_JTableHeader = jpy.get_type("io.deephaven.qst.table.TableHeader")
_JCsvTools = jpy.get_type("io.deephaven.csv.CsvTools")
_JParsers = jpy.get_type("io.deephaven.csv.parsers.Parsers")
_JArrays = jpy.get_type("java.util.Arrays")

INFERENCE_STRINGS = _JInferenceSpecs.strings()
INFERENCE_MINIMAL = _JInferenceSpecs.minimal()
INFERENCE_STANDARD = _JInferenceSpecs.standard()
INFERENCE_STANDARD_TIMES = _JInferenceSpecs.standardTimes()

# every module method should be decorated with @_passThrough
@wrapt.decorator
Expand All @@ -87,22 +57,9 @@ def _passThrough(wrapped, instance, args, kwargs):
return wrapped(*args, **kwargs)


@_passThrough
def _build_header(header: Dict[str, DataType] = None):
if not header:
return None

table_header_builder = _JTableHeader.builder()
for k, v in header.items():
table_header_builder.putHeaders(k, v)

return table_header_builder.build()


@_passThrough
def read(path: str,
header: Dict[str, DataType] = None,
inference: Any = None,
header: Dict[str, dht.DataType] = None,
headless: bool = False,
delimiter: str = ",",
quote: str = "\"",
Expand All @@ -114,7 +71,6 @@ def read(path: str,
Args:
path (str): a file path or a URL string
header (Dict[str, DataType]): a dict to define the table columns with key being the name, value being the data type
inference (csv.Inference): an Enum value specifying the rules for data type inference, default is INFERENCE_STANDARD
headless (bool): indicates if the CSV data is headless, default is False
delimiter (str): the delimiter used by the CSV, default is the comma
quote (str): the quote character for the CSV, default is double quote
Expand All @@ -130,17 +86,26 @@ def read(path: str,
Exception
"""

if inference is None:
inference = INFERENCE_STANDARD

csv_specs_builder = _JCsvSpecs.builder()

# build the head spec
table_header = _build_header(header)
if table_header:
csv_specs_builder.header(table_header)

csv_specs = (csv_specs_builder.inference(inference)
csv_specs_builder = _JCsvTools.builder()

if header:
csv_specs_builder.headers(_JArrays.asList(list(header.keys())))
parser_map = {
dht.bool_ : _JParsers.BOOLEAN,
dht.byte : _JParsers.BYTE,
dht.char : _JParsers.CHAR,
dht.short : _JParsers.SHORT,
dht.int_ : _JParsers.INT,
dht.long_ : _JParsers.LONG,
dht.float_ : _JParsers.FLOAT_FAST,
dht.double : _JParsers.DOUBLE,
dht.string : _JParsers.STRING,
dht.datetime : _JParsers.DATETIME
}
for column_name, column_type in header.items():
csv_specs_builder.putParserForName(column_name, parser_map[column_type])

csv_specs = (csv_specs_builder
.hasHeaderRow(not headless)
.delimiter(ord(delimiter))
.quote(ord(quote))
Expand Down
4 changes: 2 additions & 2 deletions extensions/csv/build.gradle
Original file line number Diff line number Diff line change
Expand Up @@ -6,10 +6,10 @@ description 'CSV: Support to read and write engine tables from/to CSV'

dependencies {
api project(':engine-api')
api 'io.deephaven:deephaven-csv:0.1.0'

implementation project(':engine-table'),
project(':engine-base'),
'ch.randelshofer:fastdoubleparser:0.3.0'
project(':engine-base')

Classpaths.inheritImmutables(project)

Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
package io.deephaven.csv;

import io.deephaven.api.util.NameValidator;
import io.deephaven.csv.CsvSpecs.Builder;

import java.util.function.Function;
import java.util.function.Predicate;
import java.util.regex.Pattern;

/**
 * A {@link Builder#headerLegalizer(Function)} that replaces {@code '-'} and {@code ' '} with {@code '_'}. Also
 * implements {@link Builder#headerValidator(Predicate)}.
 */
public enum ColumnNameLegalizer implements Function<String[], String[]>, Predicate<String> {
    INSTANCE;

    // Characters that get rewritten to '_' during legalization; compiled once for the singleton.
    private static final Pattern ILLEGAL_CHARS = Pattern.compile("[- ]");

    /** Rewrite every {@code '-'} or {@code ' '} in {@code columnName} to {@code '_'}. */
    private String legalize(String columnName) {
        return ILLEGAL_CHARS.matcher(columnName).replaceAll("_");
    }

    /**
     * Legalize each of {@code columnNames}, delegating the actual validation and renaming to
     * {@link NameValidator#legalizeColumnNames}.
     */
    @Override
    public String[] apply(String[] columnNames) {
        return NameValidator.legalizeColumnNames(columnNames, this::legalize, true);
    }

    /** Return whether {@code columnName} is already a valid column name. */
    @Override
    public boolean test(String columnName) {
        return NameValidator.isValidColumnName(columnName);
    }
}
Loading