-
Notifications
You must be signed in to change notification settings - Fork 4.3k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Source Postgres: Handle Arrays data types #16990
Changes from 20 commits
3379208
8e3c18a
7a12915
e45ca3f
8feb061
4ef8894
4cec767
2f9ca39
d588ead
3e418a0
3f0fbd7
afea462
b577bf6
ed915bc
1e74d51
135081a
154e884
2d5e680
80785cc
86c48be
f6f3749
9e0a2a9
7d65724
2a26f7b
b0c2fc5
6d37e38
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -4,20 +4,38 @@ | |
|
||
package io.airbyte.integrations.debezium.internals; | ||
|
||
import static io.airbyte.db.DataTypeUtils.TIMETZ_FORMATTER; | ||
import static io.airbyte.db.jdbc.DateTimeConverter.convertToDate; | ||
import static io.airbyte.db.jdbc.DateTimeConverter.convertToTime; | ||
import static io.airbyte.db.jdbc.DateTimeConverter.convertToTimestamp; | ||
import static io.airbyte.db.jdbc.DateTimeConverter.convertToTimestampWithTimezone; | ||
import static org.apache.kafka.connect.data.Schema.OPTIONAL_BOOLEAN_SCHEMA; | ||
import static org.apache.kafka.connect.data.Schema.OPTIONAL_FLOAT64_SCHEMA; | ||
import static org.apache.kafka.connect.data.Schema.OPTIONAL_STRING_SCHEMA; | ||
|
||
import io.airbyte.db.jdbc.DateTimeConverter; | ||
import io.debezium.spi.converter.CustomConverter; | ||
import io.debezium.spi.converter.RelationalColumn; | ||
import io.debezium.time.Conversions; | ||
import java.math.BigDecimal; | ||
import java.nio.charset.StandardCharsets; | ||
import java.sql.SQLException; | ||
import java.time.LocalDate; | ||
import java.time.LocalTime; | ||
import java.time.OffsetTime; | ||
import java.time.format.DateTimeFormatter; | ||
import java.util.ArrayList; | ||
import java.util.Arrays; | ||
import java.util.Base64; | ||
import java.util.List; | ||
import java.util.Locale; | ||
import java.util.Objects; | ||
import java.util.Properties; | ||
import java.util.concurrent.TimeUnit; | ||
import java.util.stream.Collectors; | ||
import org.apache.commons.codec.binary.Hex; | ||
import org.apache.kafka.connect.data.SchemaBuilder; | ||
import org.postgresql.jdbc.PgArray; | ||
import org.postgresql.util.PGInterval; | ||
import org.slf4j.Logger; | ||
import org.slf4j.LoggerFactory; | ||
|
@@ -33,6 +51,7 @@ public class PostgresConverter implements CustomConverter<SchemaBuilder, Relatio | |
private final String[] TEXT_TYPES = | ||
{"VARCHAR", "VARBINARY", "BLOB", "TEXT", "LONGTEXT", "TINYTEXT", "MEDIUMTEXT", "INVENTORY_ITEM", "TSVECTOR", "TSQUERY", "PG_LSN"}; | ||
private final String[] NUMERIC_TYPES = {"NUMERIC", "DECIMAL"}; | ||
private final String[] ARRAY_TYPES = {"_NAME", "_NUMERIC", "_BYTEA", "_MONEY", "_BIT", "_DATE", "_TIME", "_TIMETZ", "_TIMESTAMP", "_TIMESTAMPTZ"}; | ||
private final String BYTEA_TYPE = "BYTEA"; | ||
|
||
@Override | ||
|
@@ -52,9 +71,22 @@ public void converterFor(final RelationalColumn field, final ConverterRegistrati | |
registerBytea(field, registration); | ||
} else if (Arrays.stream(NUMERIC_TYPES).anyMatch(s -> s.equalsIgnoreCase(field.typeName()))) { | ||
registerNumber(field, registration); | ||
} else if (Arrays.stream(ARRAY_TYPES).anyMatch(s -> s.equalsIgnoreCase(field.typeName()))) { | ||
registerArray(field, registration); | ||
} | ||
} | ||
|
||
private void registerArray(RelationalColumn field, ConverterRegistration<SchemaBuilder> registration) { | ||
final String fieldType = field.typeName().toUpperCase(); | ||
final SchemaBuilder arraySchema = switch (fieldType) { | ||
case "_NUMERIC", "_MONEY" -> SchemaBuilder.array(OPTIONAL_FLOAT64_SCHEMA); | ||
case "_NAME", "_DATE", "_TIME", "_TIMESTAMP", "_TIMESTAMPTZ", "_TIMETZ", "_BYTEA" -> SchemaBuilder.array(OPTIONAL_STRING_SCHEMA); | ||
case "_BIT" -> SchemaBuilder.array(OPTIONAL_BOOLEAN_SCHEMA); | ||
default -> SchemaBuilder.array(OPTIONAL_STRING_SCHEMA); | ||
}; | ||
registration.register(arraySchema, x -> convertArray(x, field)); | ||
} | ||
|
||
private void registerNumber(final RelationalColumn field, final ConverterRegistration<SchemaBuilder> registration) { | ||
registration.register(SchemaBuilder.string().optional(), x -> { | ||
if (x == null) { | ||
|
@@ -106,6 +138,72 @@ private void registerText(final RelationalColumn field, final ConverterRegistrat | |
}); | ||
} | ||
|
||
private Object convertArray(Object x, RelationalColumn field) { | ||
final String fieldType = field.typeName().toUpperCase(); | ||
Object[] values = new Object[0]; | ||
try { | ||
values = (Object[]) ((PgArray) x).getArray(); | ||
} catch (SQLException e) { | ||
LOGGER.error("Failed to convert PgArray:" + e); | ||
} | ||
switch (fieldType) { | ||
// debezium currently cannot handle MONEY[] datatype and it's not implemented | ||
case "_MONEY": | ||
// PgArray.getArray() trying to convert to Double instead of PgMoney | ||
// due to incorrect type mapping in the postgres driver | ||
// https://github.com/pgjdbc/pgjdbc/blob/d5ed52ef391670e83ae5265af2f7301c615ce4ca/pgjdbc/src/main/java/org/postgresql/jdbc/TypeInfoCache.java#L88 | ||
// and throws an exception, so a custom implementation of converting to String is used to get the | ||
// value as is | ||
final String nativeMoneyValue = ((PgArray) x).toString(); | ||
final String substringM = Objects.requireNonNull(nativeMoneyValue).substring(1, nativeMoneyValue.length() - 1); | ||
final char currency = substringM.charAt(0); | ||
final String regex = "\\" + currency; | ||
final List<String> myListM = new ArrayList<>(Arrays.asList(substringM.split(regex))); | ||
return myListM.stream() | ||
// since the separator is the currency sign, all extra characters must be removed except for numbers | ||
// and dots | ||
.map(val -> val.replaceAll("[^\\d.]", "")) | ||
.filter(money -> !money.isEmpty()) | ||
.map(Double::valueOf) | ||
.collect(Collectors.toList()); | ||
case "_NUMERIC": | ||
return Arrays.stream(values).map(value -> value == null ? null : Double.valueOf(value.toString())).collect(Collectors.toList()); | ||
case "_TIME": | ||
return Arrays.stream(values).map(value -> value == null ? null : convertToTime(value)).collect(Collectors.toList()); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Nice! I like it when we are able to reuse code |
||
case "_DATE": | ||
return Arrays.stream(values).map(value -> value == null ? null : convertToDate(value)).collect(Collectors.toList()); | ||
case "_TIMESTAMP": | ||
return Arrays.stream(values).map(value -> value == null ? null : convertToTimestamp(value)).collect(Collectors.toList()); | ||
case "_TIMESTAMPTZ": | ||
return Arrays.stream(values).map(value -> value == null ? null : convertToTimestampWithTimezone(value)).collect(Collectors.toList()); | ||
case "_TIMETZ": | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Why cant we use the There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
debezium handles timetz and timetz [] differently There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Let's add this logic to method
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
@subodh1810 the approach of converting to java.sql.Time is not entirely correct, because in this case we lose information about the time zone. That's why I refused to use it. |
||
|
||
final List<String> timetzArr = new ArrayList<>(); | ||
final String nativeValue = ((PgArray) x).toString(); | ||
final String substring = Objects.requireNonNull(nativeValue).substring(1, nativeValue.length() - 1); | ||
final List<String> times = new ArrayList<>(Arrays.asList(substring.split(","))); | ||
final DateTimeFormatter format = DateTimeFormatter.ofPattern("HH:mm:ss[.SSSSSS]X"); | ||
|
||
times.forEach(s -> { | ||
if (s.equalsIgnoreCase("NULL")) { | ||
timetzArr.add(null); | ||
} else { | ||
final OffsetTime parsed = OffsetTime.parse(s, format); | ||
timetzArr.add(parsed.format(TIMETZ_FORMATTER)); | ||
} | ||
}); | ||
return timetzArr; | ||
case "_BYTEA": | ||
return Arrays.stream(values).map(value -> Base64.getEncoder().encodeToString((byte[]) value)).collect(Collectors.toList()); | ||
case "_BIT": | ||
return Arrays.stream(values).map(value -> (Boolean) value).collect(Collectors.toList()); | ||
case "_NAME": | ||
return Arrays.stream(values).map(value -> (String) value).collect(Collectors.toList()); | ||
default: | ||
return new ArrayList<>(); | ||
} | ||
} | ||
|
||
private int getTimePrecision(final RelationalColumn field) { | ||
return field.scale().orElse(-1); | ||
} | ||
|
@@ -127,30 +225,20 @@ private void registerDate(final RelationalColumn field, final ConverterRegistrat | |
case "TIMESTAMP": | ||
if (x instanceof final Long l) { | ||
if (getTimePrecision(field) <= 3) { | ||
return DateTimeConverter.convertToTimestamp(Conversions.toInstantFromMillis(l)); | ||
return convertToTimestamp(Conversions.toInstantFromMillis(l)); | ||
} | ||
if (getTimePrecision(field) <= 6) { | ||
return DateTimeConverter.convertToTimestamp(Conversions.toInstantFromMicros(l)); | ||
return convertToTimestamp(Conversions.toInstantFromMicros(l)); | ||
} | ||
} | ||
return DateTimeConverter.convertToTimestamp(x); | ||
return convertToTimestamp(x); | ||
case "DATE": | ||
if (x instanceof Integer) { | ||
return DateTimeConverter.convertToDate(LocalDate.ofEpochDay((Integer) x)); | ||
return convertToDate(LocalDate.ofEpochDay((Integer) x)); | ||
} | ||
return DateTimeConverter.convertToDate(x); | ||
return convertToDate(x); | ||
case "TIME": | ||
if (x instanceof Long) { | ||
if (getTimePrecision(field) <= 3) { | ||
long l = Math.multiplyExact((Long) x, TimeUnit.MILLISECONDS.toNanos(1)); | ||
return DateTimeConverter.convertToTime(LocalTime.ofNanoOfDay(l)); | ||
} | ||
if (getTimePrecision(field) <= 6) { | ||
long l = Math.multiplyExact((Long) x, TimeUnit.MICROSECONDS.toNanos(1)); | ||
return DateTimeConverter.convertToTime(LocalTime.ofNanoOfDay(l)); | ||
} | ||
} | ||
return DateTimeConverter.convertToTime(x); | ||
return resolveTime(field, x); | ||
case "INTERVAL": | ||
return convertInterval((PGInterval) x); | ||
default: | ||
|
@@ -159,6 +247,20 @@ private void registerDate(final RelationalColumn field, final ConverterRegistrat | |
}); | ||
} | ||
|
||
private String resolveTime(RelationalColumn field, Object x) { | ||
if (x instanceof Long) { | ||
if (getTimePrecision(field) <= 3) { | ||
long l = Math.multiplyExact((Long) x, TimeUnit.MILLISECONDS.toNanos(1)); | ||
return DateTimeConverter.convertToTime(LocalTime.ofNanoOfDay(l)); | ||
} | ||
if (getTimePrecision(field) <= 6) { | ||
long l = Math.multiplyExact((Long) x, TimeUnit.MICROSECONDS.toNanos(1)); | ||
return DateTimeConverter.convertToTime(LocalTime.ofNanoOfDay(l)); | ||
} | ||
} | ||
return DateTimeConverter.convertToTime(x); | ||
} | ||
|
||
private String convertInterval(final PGInterval pgInterval) { | ||
final StringBuilder resultInterval = new StringBuilder(); | ||
formatDateUnit(resultInterval, pgInterval.getYears(), " year "); | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I am not sure I understand why we are doing this replaceAll; can you add a comment explaining why it is required?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Conversion from PgArray happens incorrectly and the exception "Failed to convert PgArray:org.postgresql.util.PSQLException: Bad value for type double : $999.99" is thrown, so for money arrays I wrote a custom conversion.
The data in a PgArray looks something like this: {$999.99,"$1,001.01","$45,000.00",$1.00,$800.00,"$22,222.01","$1,001.01"}
To convert the values to doubles correctly, I first split on the currency sign and then remove every character except digits and dots (commas, quotes, etc.), which yields the correct double array:
{999.99,1001.01,45000.00,1.00,800.00,22222.01,1001.01}