-
Notifications
You must be signed in to change notification settings - Fork 4.3k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Destination redshift: async standard inserts #32888
Changes from 25 commits
2a59e85
9fe0da3
00dc110
57a9fd4
b87b935
b7640d6
951e8eb
3021982
89e2b47
617a75a
1759186
cf3171f
a6e37df
b73a47f
949be58
6b74f57
0067616
7128786
5f70fd0
422af2e
90fb108
b42a93e
3d1ff9d
2051da9
1d2bca9
f4daa6a
b9b246f
95c21d0
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -4,31 +4,35 @@ | |
|
||
package io.airbyte.cdk.integrations.destination.jdbc; | ||
|
||
import static io.airbyte.cdk.integrations.destination.jdbc.constants.GlobalDataSizeConstants.DEFAULT_MAX_BATCH_SIZE_BYTES; | ||
|
||
import com.fasterxml.jackson.databind.JsonNode; | ||
import com.google.common.base.Preconditions; | ||
import io.airbyte.cdk.db.jdbc.JdbcDatabase; | ||
import io.airbyte.cdk.db.jdbc.JdbcUtils; | ||
import io.airbyte.cdk.integrations.base.AirbyteMessageConsumer; | ||
import io.airbyte.cdk.integrations.base.SerializedAirbyteMessageConsumer; | ||
import io.airbyte.cdk.integrations.destination.NamingConventionTransformer; | ||
import io.airbyte.cdk.integrations.destination.buffered_stream_consumer.BufferedStreamConsumer; | ||
import io.airbyte.cdk.integrations.destination.buffered_stream_consumer.OnCloseFunction; | ||
import io.airbyte.cdk.integrations.destination.buffered_stream_consumer.OnStartFunction; | ||
import io.airbyte.cdk.integrations.destination.buffered_stream_consumer.RecordWriter; | ||
import io.airbyte.cdk.integrations.destination.record_buffer.InMemoryRecordBufferingStrategy; | ||
import io.airbyte.cdk.integrations.destination_async.AsyncStreamConsumer; | ||
import io.airbyte.cdk.integrations.destination_async.OnCloseFunction; | ||
import io.airbyte.cdk.integrations.destination_async.buffers.BufferManager; | ||
import io.airbyte.cdk.integrations.destination_async.partial_messages.PartialAirbyteMessage; | ||
import io.airbyte.commons.json.Jsons; | ||
import io.airbyte.integrations.base.destination.typing_deduping.ParsedCatalog; | ||
import io.airbyte.integrations.base.destination.typing_deduping.TypeAndDedupeOperationValve; | ||
import io.airbyte.integrations.base.destination.typing_deduping.TyperDeduper; | ||
import io.airbyte.protocol.models.v0.AirbyteMessage; | ||
import io.airbyte.protocol.models.v0.AirbyteRecordMessage; | ||
import io.airbyte.protocol.models.v0.AirbyteStream; | ||
import io.airbyte.protocol.models.v0.AirbyteStreamNameNamespacePair; | ||
import io.airbyte.protocol.models.v0.ConfiguredAirbyteCatalog; | ||
import io.airbyte.protocol.models.v0.ConfiguredAirbyteStream; | ||
import io.airbyte.protocol.models.v0.DestinationSyncMode; | ||
import java.time.Instant; | ||
import java.util.ArrayList; | ||
import java.util.Collection; | ||
import java.util.List; | ||
import java.util.Map; | ||
import java.util.concurrent.Executors; | ||
import java.util.function.Consumer; | ||
import java.util.function.Function; | ||
import java.util.stream.Collectors; | ||
|
@@ -53,21 +57,30 @@ public class JdbcBufferedConsumerFactory { | |
|
||
private static final Logger LOGGER = LoggerFactory.getLogger(JdbcBufferedConsumerFactory.class); | ||
|
||
public static AirbyteMessageConsumer create(final Consumer<AirbyteMessage> outputRecordCollector, | ||
final JdbcDatabase database, | ||
final SqlOperations sqlOperations, | ||
final NamingConventionTransformer namingResolver, | ||
final JsonNode config, | ||
final ConfiguredAirbyteCatalog catalog) { | ||
public static SerializedAirbyteMessageConsumer createAsync(final Consumer<AirbyteMessage> outputRecordCollector, | ||
final JdbcDatabase database, | ||
final SqlOperations sqlOperations, | ||
final NamingConventionTransformer namingResolver, | ||
final JsonNode config, | ||
final ConfiguredAirbyteCatalog catalog, | ||
final String defaultNamespace, | ||
final boolean use1s1t, | ||
// TODO these are currently unused, but we'll need it for DV2 things | ||
// specifically creating the dv2 raw tables instead of legacy tables | ||
final ParsedCatalog parsedCatalog, | ||
final TyperDeduper typerDeduper, | ||
// TODO this is only needed if we want to do incremental T+D | ||
final TypeAndDedupeOperationValve typerDeduperValve) { | ||
final List<WriteConfig> writeConfigs = createWriteConfigs(namingResolver, config, catalog, sqlOperations.isSchemaRequired()); | ||
|
||
return new BufferedStreamConsumer( | ||
return new AsyncStreamConsumer( | ||
outputRecordCollector, | ||
onStartFunction(database, sqlOperations, writeConfigs), | ||
new InMemoryRecordBufferingStrategy(recordWriterFunction(database, sqlOperations, writeConfigs, catalog), DEFAULT_MAX_BATCH_SIZE_BYTES), | ||
onCloseFunction(), | ||
onCloseFunction(use1s1t, typerDeduper), | ||
new JdbcInsertFlushFunction(recordWriterFunction(database, sqlOperations, writeConfigs, catalog)), | ||
catalog, | ||
sqlOperations::isValidData); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. every implementation of this method is just |
||
new BufferManager((long) (Runtime.getRuntime().maxMemory() * 0.2)), | ||
defaultNamespace, | ||
Executors.newFixedThreadPool(2)); | ||
} | ||
|
||
private static List<WriteConfig> createWriteConfigs(final NamingConventionTransformer namingResolver, | ||
|
@@ -77,14 +90,12 @@ private static List<WriteConfig> createWriteConfigs(final NamingConventionTransf | |
if (schemaRequired) { | ||
Preconditions.checkState(config.has("schema"), "jdbc destinations must specify a schema."); | ||
} | ||
final Instant now = Instant.now(); | ||
return catalog.getStreams().stream().map(toWriteConfig(namingResolver, config, now, schemaRequired)).collect(Collectors.toList()); | ||
return catalog.getStreams().stream().map(toWriteConfig(namingResolver, config, schemaRequired)).collect(Collectors.toList()); | ||
} | ||
|
||
private static Function<ConfiguredAirbyteStream, WriteConfig> toWriteConfig( | ||
final NamingConventionTransformer namingResolver, | ||
final JsonNode config, | ||
final Instant now, | ||
final boolean schemaRequired) { | ||
return stream -> { | ||
Preconditions.checkNotNull(stream.getDestinationSyncMode(), "Undefined destination sync mode"); | ||
|
@@ -133,11 +144,10 @@ private static String getOutputSchema(final AirbyteStream stream, | |
* @param database JDBC database to connect to | ||
* @param sqlOperations interface for execution SQL queries | ||
* @param writeConfigs settings for each stream | ||
* @return | ||
*/ | ||
private static OnStartFunction onStartFunction(final JdbcDatabase database, | ||
final SqlOperations sqlOperations, | ||
final List<WriteConfig> writeConfigs) { | ||
final Collection<WriteConfig> writeConfigs) { | ||
return () -> { | ||
LOGGER.info("Preparing raw tables in destination started for {} streams", writeConfigs.size()); | ||
final List<String> queryList = new ArrayList<>(); | ||
|
@@ -168,12 +178,11 @@ private static OnStartFunction onStartFunction(final JdbcDatabase database, | |
* @param sqlOperations interface of SQL queries to execute | ||
* @param writeConfigs settings for each stream | ||
* @param catalog catalog of all streams to sync | ||
* @return | ||
*/ | ||
private static RecordWriter<AirbyteRecordMessage> recordWriterFunction(final JdbcDatabase database, | ||
final SqlOperations sqlOperations, | ||
final List<WriteConfig> writeConfigs, | ||
final ConfiguredAirbyteCatalog catalog) { | ||
private static RecordWriter<PartialAirbyteMessage> recordWriterFunction(final JdbcDatabase database, | ||
final SqlOperations sqlOperations, | ||
final List<WriteConfig> writeConfigs, | ||
final ConfiguredAirbyteCatalog catalog) { | ||
final Map<AirbyteStreamNameNamespacePair, WriteConfig> pairToWriteConfig = writeConfigs.stream() | ||
.collect(Collectors.toUnmodifiableMap(JdbcBufferedConsumerFactory::toNameNamespacePair, Function.identity())); | ||
|
||
|
@@ -190,11 +199,19 @@ private static RecordWriter<AirbyteRecordMessage> recordWriterFunction(final Jdb | |
|
||
/** | ||
* Tear down functionality | ||
* | ||
* @return | ||
*/ | ||
private static OnCloseFunction onCloseFunction() { | ||
return (hasFailed) -> {}; | ||
private static OnCloseFunction onCloseFunction(final boolean use1s1t, final TyperDeduper typerDeduper) { | ||
return (hasFailed) -> { | ||
if (use1s1t) { | ||
try { | ||
typerDeduper.typeAndDedupe(); | ||
typerDeduper.commitFinalTables(); | ||
typerDeduper.cleanup(); | ||
} catch (final Exception e) { | ||
throw new RuntimeException(e); | ||
} | ||
} | ||
}; | ||
} | ||
|
||
private static AirbyteStreamNameNamespacePair toNameNamespacePair(final WriteConfig config) { | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,37 @@ | ||
/* | ||
* Copyright (c) 2023 Airbyte, Inc., all rights reserved. | ||
*/ | ||
|
||
package io.airbyte.cdk.integrations.destination.jdbc; | ||
|
||
import io.airbyte.cdk.integrations.destination.buffered_stream_consumer.RecordWriter; | ||
import io.airbyte.cdk.integrations.destination.jdbc.constants.GlobalDataSizeConstants; | ||
import io.airbyte.cdk.integrations.destination_async.DestinationFlushFunction; | ||
import io.airbyte.cdk.integrations.destination_async.partial_messages.PartialAirbyteMessage; | ||
import io.airbyte.protocol.models.v0.AirbyteStreamNameNamespacePair; | ||
import io.airbyte.protocol.models.v0.StreamDescriptor; | ||
import java.util.stream.Stream; | ||
|
||
public class JdbcInsertFlushFunction implements DestinationFlushFunction { | ||
|
||
private final RecordWriter<PartialAirbyteMessage> recordWriter; | ||
|
||
public JdbcInsertFlushFunction(final RecordWriter<PartialAirbyteMessage> recordWriter) { | ||
this.recordWriter = recordWriter; | ||
} | ||
|
||
@Override | ||
public void flush(final StreamDescriptor desc, final Stream<PartialAirbyteMessage> stream) throws Exception { | ||
recordWriter.accept( | ||
new AirbyteStreamNameNamespacePair(desc.getName(), desc.getNamespace()), | ||
stream.toList()); | ||
} | ||
|
||
@Override | ||
public long getOptimalBatchSizeBytes() { | ||
// TODO tune this value - currently SqlOperationUtils partitions 10K records per insert statement, | ||
// but we'd like to stop doing that and instead control sql insert statement size via batch size. | ||
return GlobalDataSizeConstants.DEFAULT_MAX_BATCH_SIZE_BYTES; | ||
} | ||
|
||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -7,6 +7,7 @@ | |
import com.fasterxml.jackson.databind.JsonNode; | ||
import io.airbyte.cdk.db.jdbc.JdbcDatabase; | ||
import io.airbyte.cdk.integrations.base.JavaBaseConstants; | ||
import io.airbyte.cdk.integrations.destination_async.partial_messages.PartialAirbyteMessage; | ||
import io.airbyte.commons.exceptions.ConfigErrorException; | ||
import io.airbyte.commons.json.Jsons; | ||
import io.airbyte.protocol.models.v0.AirbyteRecordMessage; | ||
|
@@ -148,16 +149,20 @@ public boolean isValidData(final JsonNode data) { | |
|
||
@Override | ||
public final void insertRecords(final JdbcDatabase database, | ||
final List<AirbyteRecordMessage> records, | ||
final List<PartialAirbyteMessage> records, | ||
final String schemaName, | ||
final String tableName) | ||
throws Exception { | ||
dataAdapter.ifPresent(adapter -> records.forEach(airbyteRecordMessage -> adapter.adapt(airbyteRecordMessage.getData()))); | ||
dataAdapter.ifPresent(adapter -> records.forEach(airbyteRecordMessage -> { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. the only DataAdapter in existence is destination-postgres, because of https://airbytehq-team.slack.com/archives/C03C4AVJWG4/p1700605909300539 / #3476 so I don't feel bad about de+reserializing here |
||
final JsonNode data = Jsons.deserializeExact(airbyteRecordMessage.getSerialized()); | ||
adapter.adapt(data); | ||
airbyteRecordMessage.setSerialized(Jsons.serialize(data)); | ||
})); | ||
insertRecordsInternal(database, records, schemaName, tableName); | ||
} | ||
|
||
protected abstract void insertRecordsInternal(JdbcDatabase database, | ||
List<AirbyteRecordMessage> records, | ||
List<PartialAirbyteMessage> records, | ||
String schemaName, | ||
String tableName) | ||
throws Exception; | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
this isn't used by redshift, just adding it since I'm in this part of the code anyway