From 0f47a6c7a14ad02ae9a01c686d4eebfacccea8ce Mon Sep 17 00:00:00 2001 From: Mike Pigott Date: Fri, 8 Feb 2019 23:21:09 +0100 Subject: [PATCH] ARROW-4142: [Java] JDBC Array -> Arrow ListVector --- .../arrow/adapter/jdbc/JdbcToArrow.java | 2 +- .../arrow/adapter/jdbc/JdbcToArrowConfig.java | 97 ++- .../arrow/adapter/jdbc/JdbcToArrowUtils.java | 566 +++++++++++------- .../adapter/jdbc/JdbcToArrowConfigTest.java | 35 +- .../org/apache/arrow/adapter/jdbc/Table.java | 6 +- 5 files changed, 481 insertions(+), 225 deletions(-) diff --git a/java/adapter/jdbc/src/main/java/org/apache/arrow/adapter/jdbc/JdbcToArrow.java b/java/adapter/jdbc/src/main/java/org/apache/arrow/adapter/jdbc/JdbcToArrow.java index 79102043a0f83..7f8c8c08d87b1 100644 --- a/java/adapter/jdbc/src/main/java/org/apache/arrow/adapter/jdbc/JdbcToArrow.java +++ b/java/adapter/jdbc/src/main/java/org/apache/arrow/adapter/jdbc/JdbcToArrow.java @@ -184,7 +184,6 @@ public static VectorSchemaRoot sqlToArrow(ResultSet resultSet, BaseAllocator all */ public static VectorSchemaRoot sqlToArrow(ResultSet resultSet, Calendar calendar) throws SQLException, IOException { Preconditions.checkNotNull(resultSet, "JDBC ResultSet object can not be null"); - return sqlToArrow(resultSet, new JdbcToArrowConfig(new RootAllocator(Integer.MAX_VALUE), calendar, false)); } @@ -220,6 +219,7 @@ public static VectorSchemaRoot sqlToArrow(ResultSet resultSet, JdbcToArrowConfig throws SQLException, IOException { Preconditions.checkNotNull(resultSet, "JDBC ResultSet object can not be null"); Preconditions.checkNotNull(config, "The configuration cannot be null"); + Preconditions.checkArgument(config.isValid(), "The configuration must be valid"); VectorSchemaRoot root = VectorSchemaRoot.create( JdbcToArrowUtils.jdbcToArrowSchema(resultSet.getMetaData(), config), config.getAllocator()); diff --git a/java/adapter/jdbc/src/main/java/org/apache/arrow/adapter/jdbc/JdbcToArrowConfig.java b/java/adapter/jdbc/src/main/java/org/apache/arrow/adapter/jdbc/JdbcToArrowConfig.java index 8f2a8ef54f839..ac35e85fa3e54 100644 --- a/java/adapter/jdbc/src/main/java/org/apache/arrow/adapter/jdbc/JdbcToArrowConfig.java +++ b/java/adapter/jdbc/src/main/java/org/apache/arrow/adapter/jdbc/JdbcToArrowConfig.java @@ -18,6 +18,7 @@ package org.apache.arrow.adapter.jdbc; import java.util.Calendar; +import java.util.Map; import org.apache.arrow.memory.BaseAllocator; @@ -28,16 +29,29 @@ *

* The allocator is used to construct the {@link org.apache.arrow.vector.VectorSchemaRoot}, * and the calendar is used to define the time zone of any {@link org.apahe.arrow.vector.pojo.ArrowType.Timestamp} - * fields that are created during the conversion. + * fields that are created during the conversion. Neither field may be null. *

*

- * Neither field may be null. + * If the includeMetadata flag is set, the Arrow field metadata will contain information + * from the corresponding {@link java.sql.ResultSetMetaData} that was used to create the + * {@link org.apache.arrow.vector.types.pojo.FieldType} of the corresponding + * {@link org.apache.arrow.vector.FieldVector}. + *

+ *

+ * If there are any {@link java.sql.Types#ARRAY} fields in the {@link java.sql.ResultSet}, the corresponding + * {@link JdbcFieldInfo} for the array's contents must be defined here. Unfortunately, the sub-type + * information cannot be retrieved from all JDBC implementations (H2 for example, returns + * {@link java.sql.Types#NULL} for the array sub-type), so it must be configured here. The column index + * or name can be used to map to a {@link JdbcFieldInfo}, and that will be used for the conversion. *

*/ public final class JdbcToArrowConfig { + private Calendar calendar; private BaseAllocator allocator; private boolean includeMetadata; + private Map arraySubTypesByColumnIndex; + private Map arraySubTypesByColumnName; /** * Constructs a new configuration from the provided allocator and calendar. The allocator @@ -48,18 +62,21 @@ public final class JdbcToArrowConfig { * @param calendar The calendar to use when constructing Timestamp fields and reading time-based results. * @param includeMetadata Whether to include JDBC field metadata in the Arrow Schema Field metadata. */ - JdbcToArrowConfig(BaseAllocator allocator, Calendar calendar, boolean includeMetadata) { + public JdbcToArrowConfig(BaseAllocator allocator, Calendar calendar, boolean includeMetadata) { Preconditions.checkNotNull(allocator, "Memory allocator cannot be null"); this.allocator = allocator; this.calendar = calendar; this.includeMetadata = includeMetadata; + this.arraySubTypesByColumnIndex = null; + this.arraySubTypesByColumnName = null; } /** * The calendar to use when defining Arrow Timestamp fields * and retrieving {@link Date}, {@link Time}, or {@link Timestamp} * data types from the {@link ResultSet}, or null if not converting. + * * @return the calendar. */ public Calendar getCalendar() { @@ -82,4 +99,78 @@ public BaseAllocator getAllocator() { public boolean shouldIncludeMetadata() { return includeMetadata; } + + /** + * Sets whether to include JDBC ResultSet field metadata in the Arrow Schema field metadata. + * + * @param includeMetadata Whether to include or exclude JDBC metadata in the Arrow Schema field metadata. + * @return This instance of the JdbcToArrowConfig, for chaining. + */ + public JdbcToArrowConfig setIncludeMetadata(boolean includeMetadata) { + this.includeMetadata = includeMetadata; + return this; + } + + /** + * Returns the array sub-type {@link JdbcFieldInfo} defined for the provided column index. + * + * @param index The {@link java.sql.ResultSetMetaData} column index of an {@link java.sql.Types#ARRAY} type. + * @return The {@link JdbcFieldInfo} for that array's sub-type, or null if not defined. + */ + public JdbcFieldInfo getArraySubTypeByColumnIndex(int index) { + if (arraySubTypesByColumnIndex == null) { + return null; + } else { + return arraySubTypesByColumnIndex.get(index); + } + } + + /** + * Sets the mapping of column-index-to-{@link JdbcFieldInfo} used for columns of type {@link java.sql.Types#ARRAY}. + * + * @param map The mapping. + * @return This instance of the JdbcToArrowConfig, for chaining. + */ + public JdbcToArrowConfig setArraySubTypeByColumnIndexMap(Map map) { + this.arraySubTypesByColumnIndex = map; + return this; + } + + /** + * Returns the array sub-type {@link JdbcFieldInfo} defined for the provided column name. + * + * @param index The {@link java.sql.ResultSetMetaData} column name of an {@link java.sql.Types#ARRAY} type. + * @return The {@link JdbcFieldInfo} for that array's sub-type, or null if not defined. + */ + public JdbcFieldInfo getArraySubTypeByColumnName(String name) { + if (arraySubTypesByColumnName == null) { + return null; + } else { + return arraySubTypesByColumnName.get(name); + } + } + + /** + * Sets the mapping of column-name-to-{@link JdbcFieldInfo} used for columns of type {@link java.sql.Types#ARRAY}. + * + * @param map The mapping. + * @return This instance of the JdbcToArrowConfig, for chaining. + */ + public JdbcToArrowConfig setArraySubTypeByColumnNameMap(Map map) { + this.arraySubTypesByColumnName = map; + return this; + } + + /** + * Whether this configuration is valid. The configuration is valid when: + *
    + *
  • A memory allocator is provided.
  • + *
  • A calendar is provided.
  • + *
+ * + * @return Whether this configuration is valid. + */ + public boolean isValid() { + return (calendar != null) && (allocator != null); + } } diff --git a/java/adapter/jdbc/src/main/java/org/apache/arrow/adapter/jdbc/JdbcToArrowUtils.java b/java/adapter/jdbc/src/main/java/org/apache/arrow/adapter/jdbc/JdbcToArrowUtils.java index 833ca8410a969..6310305a88a6f 100644 --- a/java/adapter/jdbc/src/main/java/org/apache/arrow/adapter/jdbc/JdbcToArrowUtils.java +++ b/java/adapter/jdbc/src/main/java/org/apache/arrow/adapter/jdbc/JdbcToArrowUtils.java @@ -25,6 +25,7 @@ import java.io.InputStream; import java.math.BigDecimal; import java.nio.charset.StandardCharsets; +import java.sql.Array; import java.sql.Blob; import java.sql.Clob; import java.sql.Date; @@ -59,6 +60,7 @@ import org.apache.arrow.vector.VarBinaryVector; import org.apache.arrow.vector.VarCharVector; import org.apache.arrow.vector.VectorSchemaRoot; +import org.apache.arrow.vector.complex.ListVector; import org.apache.arrow.vector.holders.NullableBigIntHolder; import org.apache.arrow.vector.holders.NullableBitHolder; import org.apache.arrow.vector.holders.NullableDateMilliHolder; @@ -95,6 +97,13 @@ public class JdbcToArrowUtils { private static final int DEFAULT_STREAM_BUFFER_SIZE = 1024; private static final int DEFAULT_CLOB_SUBSTRING_READ_SIZE = 256; + /** + * Returns the instance of a {java.util.Calendar} with the UTC time zone and root locale. + */ + public static Calendar getUtcCalendar() { + return Calendar.getInstance(TimeZone.getTimeZone("UTC"), Locale.ROOT); + } + /** * Create Arrow {@link Schema} object for the given JDBC {@link ResultSetMetaData}. * @@ -111,67 +120,47 @@ public static Schema jdbcToArrowSchema(ResultSetMetaData rsmd, Calendar calendar } /** - * Returns the instance of a {java.util.Calendar} with the UTC time zone and root locale. - */ - public static Calendar getUtcCalendar() { - return Calendar.getInstance(TimeZone.getTimeZone("UTC"), Locale.ROOT); - } - - /** - * Create Arrow {@link Schema} object for the given JDBC {@link ResultSetMetaData}. - * - *

This method currently performs following type mapping for JDBC SQL data types to corresponding Arrow data types. - * - *

CHAR --> ArrowType.Utf8 - * NCHAR --> ArrowType.Utf8 - * VARCHAR --> ArrowType.Utf8 - * NVARCHAR --> ArrowType.Utf8 - * LONGVARCHAR --> ArrowType.Utf8 - * LONGNVARCHAR --> ArrowType.Utf8 - * NUMERIC --> ArrowType.Decimal(precision, scale) - * DECIMAL --> ArrowType.Decimal(precision, scale) - * BIT --> ArrowType.Bool - * TINYINT --> ArrowType.Int(8, signed) - * SMALLINT --> ArrowType.Int(16, signed) - * INTEGER --> ArrowType.Int(32, signed) - * BIGINT --> ArrowType.Int(64, signed) - * REAL --> ArrowType.FloatingPoint(FloatingPointPrecision.SINGLE) - * FLOAT --> ArrowType.FloatingPoint(FloatingPointPrecision.SINGLE) - * DOUBLE --> ArrowType.FloatingPoint(FloatingPointPrecision.DOUBLE) - * BINARY --> ArrowType.Binary - * VARBINARY --> ArrowType.Binary - * LONGVARBINARY --> ArrowType.Binary - * DATE --> ArrowType.Date(DateUnit.MILLISECOND) - * TIME --> ArrowType.Time(TimeUnit.MILLISECOND, 32) - * TIMESTAMP --> ArrowType.Timestamp(TimeUnit.MILLISECOND, timezone=null) - * CLOB --> ArrowType.Utf8 - * BLOB --> ArrowType.Binary + * Create Arrow {@link Schema} object for the given JDBC {@link java.sql.ResultSetMetaData}. + *

+ * The {@link JdbcToArrowUtils#getArrowTypeForJdbcField(JdbcFieldInfo, Calendar)} method is used to construct a + * {@link org.apache.arrow.vector.types.pojo.ArrowType} for each field in the {@link java.sql.ResultSetMetaData}. + *

+ *

+ * If {@link JdbcToArrowConfig#getIncludeMetadata()} returns true, the following fields + * will be added to the {@link FieldType#getMetadata()}: + *

    + *
  • {@link Constants#SQL_CATALOG_NAME_KEY} representing {@link ResultSetMetaData#getCatalogName(int)}
  • + *
  • {@link Constants#SQL_TABLE_NAME_KEY} representing {@link ResultSetMetaData#getTableName(int)}
  • + *
  • {@link Constants#SQL_COLUMN_NAME_KEY} representing {@link ResultSetMetaData#getColumnName(int)}
  • + *
  • {@link Constants#SQL_TYPE_KEY} representing {@link ResultSetMetaData#getColumnTypeName(int)}
  • + *
+ *

+ *

+ * If any columns are of type {@link java.sql.Types#ARRAY}, the configuration object will be used to look up + * the array sub-type field. The {@link JdbcToArrowConfig#getArraySubTypeByColumnIndex(int)} method will be + * checked first, followed by the {@link JdbcToArrowConfig#getArraySubTypeByColumnName(String)} method. + *

* * @param rsmd The ResultSetMetaData containing the results, to read the JDBC metadata from. * @param config The configuration to use when constructing the schema. * @return {@link Schema} * @throws SQLException on error + * @throws IllegalArgumentException if rsmd contains an {@link java.sql.Types#ARRAY} but the + * config does not have a sub-type definition for it. */ public static Schema jdbcToArrowSchema(ResultSetMetaData rsmd, JdbcToArrowConfig config) throws SQLException { Preconditions.checkNotNull(rsmd, "JDBC ResultSetMetaData object can't be null"); Preconditions.checkNotNull(config, "The configuration object must not be null"); - - final String timezone; - if (config.getCalendar() != null) { - timezone = config.getCalendar().getTimeZone().getID(); - } else { - timezone = null; - } + Preconditions.checkArgument(config.isValid(), "The configuration object must be valid"); List fields = new ArrayList<>(); int columnCount = rsmd.getColumnCount(); for (int i = 1; i <= columnCount; i++) { final String columnName = rsmd.getColumnName(i); - final FieldType fieldType; final Map metadata; if (config.shouldIncludeMetadata()) { - metadata = new HashMap<>(); + metadata = new HashMap(); metadata.put(Constants.SQL_CATALOG_NAME_KEY, rsmd.getCatalogName(i)); metadata.put(Constants.SQL_TABLE_NAME_KEY, rsmd.getTableName(i)); metadata.put(Constants.SQL_COLUMN_NAME_KEY, columnName); @@ -181,83 +170,173 @@ public static Schema jdbcToArrowSchema(ResultSetMetaData rsmd, JdbcToArrowConfig metadata = null; } - switch (rsmd.getColumnType(i)) { - case Types.BOOLEAN: - case Types.BIT: - fieldType = new FieldType(true, new ArrowType.Bool(), null, metadata); - break; - case Types.TINYINT: - fieldType = new FieldType(true, new ArrowType.Int(8, true), null, metadata); - break; - case Types.SMALLINT: - fieldType = new FieldType(true, new ArrowType.Int(16, true), null, metadata); - break; - case Types.INTEGER: - fieldType = new FieldType(true, new ArrowType.Int(32, true), null, metadata); - break; - case Types.BIGINT: - fieldType = new FieldType(true, new ArrowType.Int(64, true), null, metadata); - break; - case Types.NUMERIC: - case Types.DECIMAL: - int precision = rsmd.getPrecision(i); - int scale = rsmd.getScale(i); - fieldType = new FieldType(true, new ArrowType.Decimal(precision, scale), null, metadata); - break; - case Types.REAL: - case Types.FLOAT: - fieldType = new FieldType(true, new ArrowType.FloatingPoint(SINGLE), null, metadata); - break; - case Types.DOUBLE: - fieldType = new FieldType(true, new ArrowType.FloatingPoint(DOUBLE), null, metadata); - break; - case Types.CHAR: - case Types.NCHAR: - case Types.VARCHAR: - case Types.NVARCHAR: - case Types.LONGVARCHAR: - case Types.LONGNVARCHAR: - case Types.CLOB: - fieldType = new FieldType(true, new ArrowType.Utf8(), null, metadata); - break; - case Types.DATE: - fieldType = new FieldType(true, new ArrowType.Date(DateUnit.MILLISECOND), null, metadata); - break; - case Types.TIME: - fieldType = new FieldType(true, new ArrowType.Time(TimeUnit.MILLISECOND, 32), null, metadata); - break; - case Types.TIMESTAMP: - fieldType = - new FieldType( - true, - new ArrowType.Timestamp(TimeUnit.MILLISECOND, timezone), - null, - metadata); - break; - case Types.BINARY: - case Types.VARBINARY: - case Types.LONGVARBINARY: - case Types.BLOB: - fieldType = new FieldType(true, new ArrowType.Binary(), null, metadata); - break; - - case Types.ARRAY: - // TODO Need to handle this type - // fields.add(new Field("list", FieldType.nullable(new ArrowType.List()), null)); - default: - // no-op, shouldn't get here - fieldType = null; - break; - } + final ArrowType arrowType = getArrowTypeForJdbcField(new JdbcFieldInfo(rsmd, i), config.getCalendar()); + if (arrowType != null) { + final FieldType fieldType = new FieldType(true, arrowType, null, metadata); + + List children = null; + if (arrowType.getTypeID() == ArrowType.List.TYPE_TYPE) { + final JdbcFieldInfo arrayFieldInfo = getJdbcFieldInfoForArraySubType(rsmd, i, config); + if (arrayFieldInfo == null) { + throw new IllegalArgumentException("Configuration does not provide a mapping for array column " + i); + } + children = new ArrayList(); + final ArrowType childType = + getArrowTypeForJdbcField(arrayFieldInfo, config.getCalendar()); + children.add(new Field("child", FieldType.nullable(childType), null)); + } - if (fieldType != null) { - fields.add(new Field(columnName, fieldType, null)); + fields.add(new Field(columnName, fieldType, children)); } } return new Schema(fields, null); } + /** + * Creates an {@link org.apache.arrow.vector.types.pojo.ArrowType} + * from the {@link JdbcFieldInfo} and {@link java.util.Calendar}. + * + *

This method currently performs following type mapping for JDBC SQL data types to corresponding Arrow data types. + * + *

    + *
  • CHAR --> ArrowType.Utf8
  • + *
  • NCHAR --> ArrowType.Utf8
  • + *
  • VARCHAR --> ArrowType.Utf8
  • + *
  • NVARCHAR --> ArrowType.Utf8
  • + *
  • LONGVARCHAR --> ArrowType.Utf8
  • + *
  • LONGNVARCHAR --> ArrowType.Utf8
  • + *
  • NUMERIC --> ArrowType.Decimal(precision, scale)
  • + *
  • DECIMAL --> ArrowType.Decimal(precision, scale)
  • + *
  • BIT --> ArrowType.Bool
  • + *
  • TINYINT --> ArrowType.Int(8, signed)
  • + *
  • SMALLINT --> ArrowType.Int(16, signed)
  • + *
  • INTEGER --> ArrowType.Int(32, signed)
  • + *
  • BIGINT --> ArrowType.Int(64, signed)
  • + *
  • REAL --> ArrowType.FloatingPoint(FloatingPointPrecision.SINGLE)
  • + *
  • FLOAT --> ArrowType.FloatingPoint(FloatingPointPrecision.SINGLE)
  • + *
  • DOUBLE --> ArrowType.FloatingPoint(FloatingPointPrecision.DOUBLE)
  • + *
  • BINARY --> ArrowType.Binary
  • + *
  • VARBINARY --> ArrowType.Binary
  • + *
  • LONGVARBINARY --> ArrowType.Binary
  • + *
  • DATE --> ArrowType.Date(DateUnit.MILLISECOND)
  • + *
  • TIME --> ArrowType.Time(TimeUnit.MILLISECOND, 32)
  • + *
  • TIMESTAMP --> ArrowType.Timestamp(TimeUnit.MILLISECOND, calendar timezone)
  • + *
  • CLOB --> ArrowType.Utf8
  • + *
  • BLOB --> ArrowType.Binary
  • + *
+ * + * @param fieldInfo The field information to construct the ArrowType from. + * @param calendar The calendar to use when constructing the ArrowType.Timestamp + * for {@link java.sql.Types#TIMESTAMP} types. + * @return The corresponding ArrowType. + * @throws NullPointerException if either fieldInfo or calendar are null. + */ + public static ArrowType getArrowTypeForJdbcField(JdbcFieldInfo fieldInfo, Calendar calendar) { + Preconditions.checkNotNull(fieldInfo, "JdbcFieldInfo object cannot be null"); + + final String timezone; + if (calendar != null) { + timezone = calendar.getTimeZone().getID(); + } else { + timezone = null; + } + + + final ArrowType arrowType; + + switch (fieldInfo.getJdbcType()) { + case Types.BOOLEAN: + case Types.BIT: + arrowType = new ArrowType.Bool(); + break; + case Types.TINYINT: + arrowType = new ArrowType.Int(8, true); + break; + case Types.SMALLINT: + arrowType = new ArrowType.Int(16, true); + break; + case Types.INTEGER: + arrowType = new ArrowType.Int(32, true); + break; + case Types.BIGINT: + arrowType = new ArrowType.Int(64, true); + break; + case Types.NUMERIC: + case Types.DECIMAL: + int precision = fieldInfo.getPrecision(); + int scale = fieldInfo.getScale(); + arrowType = new ArrowType.Decimal(precision, scale); + break; + case Types.REAL: + case Types.FLOAT: + arrowType = new ArrowType.FloatingPoint(SINGLE); + break; + case Types.DOUBLE: + arrowType = new ArrowType.FloatingPoint(DOUBLE); + break; + case Types.CHAR: + case Types.NCHAR: + case Types.VARCHAR: + case Types.NVARCHAR: + case Types.LONGVARCHAR: + case Types.LONGNVARCHAR: + case Types.CLOB: + arrowType = new ArrowType.Utf8(); + break; + case Types.DATE: + arrowType = new ArrowType.Date(DateUnit.MILLISECOND); + break; + case Types.TIME: + arrowType = new ArrowType.Time(TimeUnit.MILLISECOND, 32); + break; + case Types.TIMESTAMP: + arrowType = new ArrowType.Timestamp(TimeUnit.MILLISECOND, timezone); + break; + case Types.BINARY: + case Types.VARBINARY: + case Types.LONGVARBINARY: + case Types.BLOB: + arrowType = new ArrowType.Binary(); + break; + case Types.ARRAY: + arrowType = new ArrowType.List(); + break; + default: + // no-op, shouldn't get here + arrowType = null; + break; + } + + return arrowType; + } + + /* Uses the configuration to determine what the array sub-type JdbcFieldInfo is. + * If no sub-type can be found, returns null. + */ + private static JdbcFieldInfo getJdbcFieldInfoForArraySubType( + ResultSetMetaData rsmd, + int arrayColumn, + JdbcToArrowConfig config) + throws SQLException { + + Preconditions.checkNotNull(rsmd, "ResultSet MetaData object cannot be null"); + Preconditions.checkNotNull(config, "Configuration must not be null"); + Preconditions.checkArgument( + arrayColumn > 0, + "ResultSetMetaData columns start with 1; column cannot be less than 1"); + Preconditions.checkArgument( + arrayColumn <= rsmd.getColumnCount(), + "Column number cannot be more than the number of columns"); + Preconditions.checkArgument(config.isValid(), "Configuration must be valid"); + + JdbcFieldInfo fieldInfo = config.getArraySubTypeByColumnIndex(arrayColumn); + if (fieldInfo == null) { + fieldInfo = config.getArraySubTypeByColumnName(rsmd.getColumnName(arrayColumn)); + } + return fieldInfo; + } + private static void allocateVectors(VectorSchemaRoot root, int size) { List vectors = root.getFieldVectors(); for (FieldVector fieldVector : vectors) { @@ -284,7 +363,8 @@ public static void jdbcToArrowVectors(ResultSet rs, VectorSchemaRoot root, Calen throws SQLException, IOException { Preconditions.checkNotNull(rs, "JDBC ResultSet object can't be null"); - Preconditions.checkNotNull(root, "JDBC ResultSet object can't be null"); + Preconditions.checkNotNull(root, "Vector Schema cannot be null"); + Preconditions.checkNotNull(calendar, "Calendar object can't be null"); jdbcToArrowVectors(rs, root, new JdbcToArrowConfig(new RootAllocator(0), calendar, false)); } @@ -304,123 +384,140 @@ public static void jdbcToArrowVectors(ResultSet rs, VectorSchemaRoot root, JdbcT Preconditions.checkNotNull(rs, "JDBC ResultSet object can't be null"); Preconditions.checkNotNull(root, "JDBC ResultSet object can't be null"); Preconditions.checkNotNull(config, "JDBC-to-Arrow configuration cannot be null"); + Preconditions.checkArgument(config.isValid(), "JDBC-to-Arrow configuration must be valid"); ResultSetMetaData rsmd = rs.getMetaData(); int columnCount = rsmd.getColumnCount(); allocateVectors(root, DEFAULT_BUFFER_SIZE); - final Calendar calendar = config.getCalendar(); - int rowCount = 0; while (rs.next()) { for (int i = 1; i <= columnCount; i++) { - String columnName = rsmd.getColumnName(i); - switch (rsmd.getColumnType(i)) { - case Types.BOOLEAN: - case Types.BIT: - updateVector((BitVector) root.getVector(columnName), - rs.getBoolean(i), !rs.wasNull(), rowCount); - break; - case Types.TINYINT: - updateVector((TinyIntVector) root.getVector(columnName), - rs.getInt(i), !rs.wasNull(), rowCount); - break; - case Types.SMALLINT: - updateVector((SmallIntVector) root.getVector(columnName), - rs.getInt(i), !rs.wasNull(), rowCount); - break; - case Types.INTEGER: - updateVector((IntVector) root.getVector(columnName), - rs.getInt(i), !rs.wasNull(), rowCount); - break; - case Types.BIGINT: - updateVector((BigIntVector) root.getVector(columnName), - rs.getLong(i), !rs.wasNull(), rowCount); - break; - case Types.NUMERIC: - case Types.DECIMAL: - updateVector((DecimalVector) root.getVector(columnName), - rs.getBigDecimal(i), !rs.wasNull(), rowCount); - break; - case Types.REAL: - case Types.FLOAT: - updateVector((Float4Vector) root.getVector(columnName), - rs.getFloat(i), !rs.wasNull(), rowCount); - break; - case Types.DOUBLE: - updateVector((Float8Vector) root.getVector(columnName), - rs.getDouble(i), !rs.wasNull(), rowCount); - break; - case Types.CHAR: - case Types.NCHAR: - case Types.VARCHAR: - case Types.NVARCHAR: - case Types.LONGVARCHAR: - case Types.LONGNVARCHAR: - updateVector((VarCharVector) root.getVector(columnName), - rs.getString(i), !rs.wasNull(), rowCount); - break; - case Types.DATE: - final Date date; - if (calendar != null) { - date = rs.getDate(i, calendar); - } else { - date = rs.getDate(i); - } - - updateVector((DateMilliVector) root.getVector(columnName), date, !rs.wasNull(), rowCount); - break; - case Types.TIME: - final Time time; - if (calendar != null) { - time = rs.getTime(i, calendar); - } else { - time = rs.getTime(i); - } - - updateVector((TimeMilliVector) root.getVector(columnName), time, !rs.wasNull(), rowCount); - break; - case Types.TIMESTAMP: - final Timestamp ts; - if (calendar != null) { - ts = rs.getTimestamp(i, calendar); - } else { - ts = rs.getTimestamp(i); - } - - // TODO: Need to handle precision such as milli, micro, nano - updateVector((TimeStampVector) root.getVector(columnName), ts, !rs.wasNull(), rowCount); - break; - case Types.BINARY: - case Types.VARBINARY: - case Types.LONGVARBINARY: - updateVector((VarBinaryVector) root.getVector(columnName), - rs.getBinaryStream(i), !rs.wasNull(), rowCount); - break; - case Types.ARRAY: - // TODO Need to handle this type - // fields.add(new Field("list", FieldType.nullable(new ArrowType.List()), null)); - break; - case Types.CLOB: - updateVector((VarCharVector) root.getVector(columnName), - rs.getClob(i), !rs.wasNull(), rowCount); - break; - case Types.BLOB: - updateVector((VarBinaryVector) root.getVector(columnName), - rs.getBlob(i), !rs.wasNull(), rowCount); - break; - - default: - // no-op, shouldn't get here - break; - } + jdbcToFieldVector( + rs, + i, + rs.getMetaData().getColumnType(i), + rowCount, + root.getVector(rsmd.getColumnName(i)), + config); } rowCount++; } root.setRowCount(rowCount); } + private static void jdbcToFieldVector( + ResultSet rs, + int i, + int jdbcColType, + int rowCount, + FieldVector vector, + JdbcToArrowConfig config) + throws SQLException, IOException { + + final Calendar calendar = config.getCalendar(); + + switch (jdbcColType) { + case Types.BOOLEAN: + case Types.BIT: + updateVector((BitVector) vector, + rs.getBoolean(i), !rs.wasNull(), rowCount); + break; + case Types.TINYINT: + updateVector((TinyIntVector) vector, + rs.getInt(i), !rs.wasNull(), rowCount); + break; + case Types.SMALLINT: + updateVector((SmallIntVector) vector, + rs.getInt(i), !rs.wasNull(), rowCount); + break; + case Types.INTEGER: + updateVector((IntVector) vector, + rs.getInt(i), !rs.wasNull(), rowCount); + break; + case Types.BIGINT: + updateVector((BigIntVector) vector, + rs.getLong(i), !rs.wasNull(), rowCount); + break; + case Types.NUMERIC: + case Types.DECIMAL: + updateVector((DecimalVector) vector, + rs.getBigDecimal(i), !rs.wasNull(), rowCount); + break; + case Types.REAL: + case Types.FLOAT: + updateVector((Float4Vector) vector, + rs.getFloat(i), !rs.wasNull(), rowCount); + break; + case Types.DOUBLE: + updateVector((Float8Vector) vector, + rs.getDouble(i), !rs.wasNull(), rowCount); + break; + case Types.CHAR: + case Types.NCHAR: + case Types.VARCHAR: + case Types.NVARCHAR: + case Types.LONGVARCHAR: + case Types.LONGNVARCHAR: + updateVector((VarCharVector) vector, + rs.getString(i), !rs.wasNull(), rowCount); + break; + case Types.DATE: + final Date date; + if (calendar != null) { + date = rs.getDate(i, calendar); + } else { + date = rs.getDate(i); + } + + updateVector((DateMilliVector) vector, date, !rs.wasNull(), rowCount); + break; + case Types.TIME: + final Time time; + if (calendar != null) { + time = rs.getTime(i, calendar); + } else { + time = rs.getTime(i); + } + + updateVector((TimeMilliVector) vector, time, !rs.wasNull(), rowCount); + break; + case Types.TIMESTAMP: + final Timestamp ts; + if (calendar != null) { + ts = rs.getTimestamp(i, calendar); + } else { + ts = rs.getTimestamp(i); + } + + // TODO: Need to handle precision such as milli, micro, nano + updateVector((TimeStampVector) vector, ts, !rs.wasNull(), rowCount); + break; + case Types.BINARY: + case Types.VARBINARY: + case Types.LONGVARBINARY: + updateVector((VarBinaryVector) vector, + rs.getBinaryStream(i), !rs.wasNull(), rowCount); + break; + case Types.ARRAY: + updateVector((ListVector) vector, rs, i, rowCount, config); + break; + case Types.CLOB: + updateVector((VarCharVector) vector, + rs.getClob(i), !rs.wasNull(), rowCount); + break; + case Types.BLOB: + updateVector((VarBinaryVector) vector, + rs.getBlob(i), !rs.wasNull(), rowCount); + break; + + default: + // no-op, shouldn't get here + break; + } + } + private static void updateVector(BitVector bitVector, boolean value, boolean isNonNull, int rowCount) { NullableBitHolder holder = new NullableBitHolder(); holder.isSet = isNonNull ? 1 : 0; @@ -620,4 +717,39 @@ private static void updateVector(VarBinaryVector varBinaryVector, Blob blob, boo updateVector(varBinaryVector, blob != null ? blob.getBinaryStream() : null, isNonNull, rowCount); } + private static void updateVector( + ListVector listVector, + ResultSet resultSet, + int arrayIndex, + int rowCount, + JdbcToArrowConfig config) + throws SQLException, IOException { + + final JdbcFieldInfo fieldInfo = getJdbcFieldInfoForArraySubType(resultSet.getMetaData(), arrayIndex, config); + if (fieldInfo == null) { + throw new IllegalStateException("Column " + arrayIndex + " is an array of unknown type."); + } + + final int valueCount = listVector.getValueCount(); + final Array array = resultSet.getArray(arrayIndex); + + FieldVector fieldVector = listVector.getDataVector(); + int arrayRowCount = 0; + + listVector.startNewValue(rowCount); + + if (!resultSet.wasNull()) { + try (ResultSet rs = array.getResultSet()) { + + while (rs.next()) { + // The second column contains the actual data. + jdbcToFieldVector(rs, 2, fieldInfo.getJdbcType(), valueCount + arrayRowCount, fieldVector, config); + arrayRowCount++; + } + } + } + + listVector.endValue(rowCount, arrayRowCount); + listVector.setValueCount(valueCount + arrayRowCount); + } } diff --git a/java/adapter/jdbc/src/test/java/org/apache/arrow/adapter/jdbc/JdbcToArrowConfigTest.java b/java/adapter/jdbc/src/test/java/org/apache/arrow/adapter/jdbc/JdbcToArrowConfigTest.java index bafb2dcdcc341..d2d4b24686cf0 100644 --- a/java/adapter/jdbc/src/test/java/org/apache/arrow/adapter/jdbc/JdbcToArrowConfigTest.java +++ b/java/adapter/jdbc/src/test/java/org/apache/arrow/adapter/jdbc/JdbcToArrowConfigTest.java @@ -19,7 +19,9 @@ import static org.junit.Assert.*; +import java.sql.Types; import java.util.Calendar; +import java.util.HashMap; import java.util.Locale; import java.util.TimeZone; @@ -95,7 +97,8 @@ public void testConfig() { assertTrue(newCalendar == config.getCalendar()); } - @Test public void testIncludeMetadata() { + @Test + public void testIncludeMetadata() { JdbcToArrowConfigBuilder builder = new JdbcToArrowConfigBuilder(allocator, calendar, false); JdbcToArrowConfig config = builder.build(); @@ -114,4 +117,34 @@ public void testConfig() { config = new JdbcToArrowConfig(allocator, calendar, false); assertFalse(config.shouldIncludeMetadata()); } + + @Test + public void testArraySubTypes() { + JdbcToArrowConfig config = new JdbcToArrowConfig(allocator, calendar, false); + + final int columnIndex = 1; + final String columnName = "COLUMN"; + + assertNull(config.getArraySubTypeByColumnIndex(columnIndex)); + assertNull(config.getArraySubTypeByColumnName(columnName)); + + final HashMap indexMapping = new HashMap(); + indexMapping.put(2, new JdbcFieldInfo(Types.BIGINT)); + + final HashMap fieldMapping = new HashMap(); + fieldMapping.put("NEW_COLUMN", new JdbcFieldInfo(Types.BINARY)); + + config.setArraySubTypeByColumnIndexMap(indexMapping); + config.setArraySubTypeByColumnNameMap(fieldMapping); + + assertNull(config.getArraySubTypeByColumnIndex(columnIndex)); + assertNull(config.getArraySubTypeByColumnName(columnName)); + + indexMapping.put(columnIndex, new JdbcFieldInfo(Types.BIT)); + fieldMapping.put(columnName, new JdbcFieldInfo(Types.BLOB)); + + assertNotNull(config.getArraySubTypeByColumnIndex(columnIndex)); + assertEquals(Types.BIT, config.getArraySubTypeByColumnIndex(columnIndex).getJdbcType()); + assertEquals(Types.BLOB, config.getArraySubTypeByColumnName(columnName).getJdbcType()); + } } diff --git a/java/adapter/jdbc/src/test/java/org/apache/arrow/adapter/jdbc/Table.java b/java/adapter/jdbc/src/test/java/org/apache/arrow/adapter/jdbc/Table.java index 98f799c1b5bf6..2137162667ef1 100644 --- a/java/adapter/jdbc/src/test/java/org/apache/arrow/adapter/jdbc/Table.java +++ b/java/adapter/jdbc/src/test/java/org/apache/arrow/adapter/jdbc/Table.java @@ -204,7 +204,7 @@ public void setRowCount(int rowCount) { this.rowCount = rowCount; } - private byte[][] getByteArray(String[] data) { + static byte[][] getByteArray(String[] data) { byte[][] byteArr = new byte[data.length][]; for (int i = 0; i < data.length; i++) { @@ -213,7 +213,7 @@ private byte[][] getByteArray(String[] data) { return byteArr; } - private byte[][] getHexToByteArray(String[] data) { + static byte[][] getHexToByteArray(String[] data) { byte[][] byteArr = new byte[data.length][]; for (int i = 0; i < data.length; i++) { @@ -222,7 +222,7 @@ private byte[][] getHexToByteArray(String[] data) { return byteArr; } - private static byte[] hexStringToByteArray(String s) { + static byte[] hexStringToByteArray(String s) { int len = s.length(); byte[] data = new byte[len / 2]; for (int i = 0; i < len; i += 2) {