diff --git a/docs/docs/kafka-connect.md b/docs/docs/kafka-connect.md index a904a17a9968..671bcffeb688 100644 --- a/docs/docs/kafka-connect.md +++ b/docs/docs/kafka-connect.md @@ -62,6 +62,7 @@ for exactly-once semantics. This requires Kafka 2.5 or later. |--------------------------------------------|------------------------------------------------------------------------------------------------------------------| | iceberg.tables | Comma-separated list of destination tables | | iceberg.tables.dynamic-enabled | Set to `true` to route to a table specified in `routeField` instead of using `routeRegex`, default is `false` | +| iceberg.tables.route-with | Class to use for routing records from topics to tables | | iceberg.tables.route-field | For multi-table fan-out, the name of the field used to route records to tables | | iceberg.tables.default-commit-branch | Default branch for commits, main is used if not specified | | iceberg.tables.default-id-columns | Default comma-separated list of columns that identify a row in tables (primary key) | @@ -76,6 +77,8 @@ for exactly-once semantics. This requires Kafka 2.5 or later. | iceberg.table.\<table name\>.id-columns | Comma-separated list of columns that identify a row in the table (primary key) | | iceberg.table.\
<table name\>.partition-by | Comma-separated list of partition fields to use when creating the table | | iceberg.table.\
<table name\>.route-regex | The regex used to match a record's `routeField` to a table | +| iceberg.table.\
<table name\>.topics | Comma-separated list of topic names to route to the table | +| iceberg.table.\
<table name\>.topic-regex | The regex used to match a record's Kafka topic to a table | | iceberg.control.topic | Name of the control topic, default is `control-iceberg` | | iceberg.control.commit.interval-ms | Commit interval in msec, default is 300,000 (5 min) | | iceberg.control.commit.timeout-ms | Commit timeout interval in msec, default is 30,000 (30 sec) | @@ -86,9 +89,28 @@ for exactly-once semantics. This requires Kafka 2.5 or later. | iceberg.hadoop.* | Properties passed through to the Hadoop configuration | | iceberg.kafka.* | Properties passed through to control topic Kafka client initialization | -If `iceberg.tables.dynamic-enabled` is `false` (the default) then you must specify `iceberg.tables`. If -`iceberg.tables.dynamic-enabled` is `true` then you must specify `iceberg.tables.route-field` which will -contain the name of the table. +If `iceberg.tables.route-with` is `org.apache.iceberg.connect.data.RecordRouter$DynamicRecordRouter`, +or `iceberg.tables.dynamic-enabled` is `true`, you must specify `iceberg.tables.route-field`, which will contain the name +of the table. Tables will be created dynamically and named using the value of the route field. + +If `iceberg.tables.route-with` is `org.apache.iceberg.connect.data.RecordRouter$StaticRecordRouter`, +or `iceberg.tables.route-field` is specified but `iceberg.tables.dynamic-enabled` is `false`, you must specify +`iceberg.tables` and `iceberg.table.\
<table name\>.route-regex` for each table. Records will be routed to each +table whose route regex matches the value in the route field. + +If `iceberg.tables.route-with` is `org.apache.iceberg.connect.data.RecordRouter$TopicNameRecordRouter`, then you must +specify `iceberg.tables` and `iceberg.table.\
<table name\>.topics` for each table. Records will be routed to each table +whose `topics` list contains the record's topic. + +If `iceberg.tables.route-with` is `org.apache.iceberg.connect.data.RecordRouter$TopicRegexRecordRouter`, +then you must specify `iceberg.tables` and `iceberg.table.\
<table name\>.topic-regex` for each table. Records will be +routed to each table whose topic regex matches the record's topic. + +If `iceberg.tables.route-with` is unset and no `iceberg.tables.route-field` is specified, records from all topics are +routed to all of the tables. + +For backwards compatibility, setting `iceberg.tables.dynamic-enabled` to `true` will take precedence over the +`iceberg.tables.route-with` configuration. ### Kafka configuration @@ -350,3 +372,77 @@ See above for creating two tables. } } ``` + +### Topic name based multi-table routing + +This assumes that the source topics `events_create` and `events_list` already exist and are named as such. + +This example writes to tables whose `topics` property includes the topic of the record. Records read from +topic `events_create` are routed to the `default.events_create` table, and records read from topic `events_list` +are routed to the `default.events_list` table. + +#### Create two destination tables + +See above for creating two tables. + +#### Connector config + +```json +{ + "name": "events-sink", + "config": { + "connector.class": "org.apache.iceberg.connect.IcebergSinkConnector", + "tasks.max": "2", + "topics": "events_create,events_list", + "iceberg.tables": "default.events_list,default.events_create", + "iceberg.tables.route-with": "org.apache.iceberg.connect.data.RecordRouter$TopicNameRecordRouter", + "iceberg.table.default.events_list.topics": "events_list", + "iceberg.table.default.events_create.topics": "events_create", + "iceberg.catalog.type": "rest", + "iceberg.catalog.uri": "https://localhost", + "iceberg.catalog.credential": "", + "iceberg.catalog.warehouse": "" + } +} +``` + +### Topic regex based multi-table routing + +This assumes that the source topics `events_create` and `events_list` already exist and are named as such. + +This example writes to tables whose `topic-regex` property matches the topic of the record. Records read from +topic `events_create` are routed to the `default.events_create` table, and records read from topic `events_list` +are routed to the `default.events_list` table. Regex-based routing is particularly useful when the source topics +of the connector are specified using the `topics.regex` configuration. + +#### Create two destination tables + +See above for creating two tables. + +#### Connector config + +```json +{ + "name": "events-sink", + "config": { + "connector.class": "org.apache.iceberg.connect.IcebergSinkConnector", + "tasks.max": "2", + "topics": "events_create,events_list", + "iceberg.tables": "default.events_list,default.events_create", + "iceberg.tables.route-with": "org.apache.iceberg.connect.data.RecordRouter$TopicRegexRecordRouter", + "iceberg.table.default.events_list.topic-regex": "events_list", + "iceberg.table.default.events_create.topic-regex": "events_create", + "iceberg.catalog.type": "rest", + "iceberg.catalog.uri": "https://localhost", + "iceberg.catalog.credential": "", + "iceberg.catalog.warehouse": "" + } +} +``` + +### Custom routing of records + +Routing records from Kafka to tables can be customized by providing your own implementation of `RecordRouter`. +Implementations must extend `org.apache.iceberg.connect.data.RecordRouter`. To write a record to a table, +an implementation can call the `writeToTable` method of the `RecordRouter`. You can then set `iceberg.tables.route-with` +to the class name of your implementation. The class must be available at runtime to the Kafka Connect workers.
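As an illustration only, a minimal sketch of a custom router is shown below. It routes each record using a hypothetical record header named `target-table` that carries the destination table name; the package name, class name, and header name are placeholders and not part of the connector.

```java
package com.example.connect; // hypothetical package

import org.apache.iceberg.catalog.Catalog;
import org.apache.iceberg.connect.IcebergSinkConfig;
import org.apache.iceberg.connect.data.RecordRouter;
import org.apache.kafka.connect.header.Header;
import org.apache.kafka.connect.sink.SinkRecord;

/** Example router that reads the destination table name from a record header. */
public class HeaderRecordRouter extends RecordRouter {

  // Hypothetical header carrying the target table name, e.g. "default.events"
  private static final String TABLE_HEADER = "target-table";

  // The sink instantiates routers via a (Catalog, IcebergSinkConfig) constructor
  public HeaderRecordRouter(Catalog catalog, IcebergSinkConfig config) {
    super(catalog, config);
  }

  @Override
  public void routeRecord(SinkRecord record) {
    Header header = record.headers().lastWithName(TABLE_HEADER);
    if (header != null && header.value() != null) {
      // ignoreMissingTable=true: silently skip records whose target table does not exist
      writeToTable(header.value().toString(), record, true);
    }
  }
}
```

The connector would then be configured with `"iceberg.tables.route-with": "com.example.connect.HeaderRecordRouter"`, with the compiled class placed on the Kafka Connect workers' plugin path.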
diff --git a/kafka-connect/kafka-connect-runtime/src/integration/java/org/apache/iceberg/connect/IntegrationTopicRoutingTest.java b/kafka-connect/kafka-connect-runtime/src/integration/java/org/apache/iceberg/connect/IntegrationTopicRoutingTest.java new file mode 100644 index 000000000000..e5c706e0de4c --- /dev/null +++ b/kafka-connect/kafka-connect-runtime/src/integration/java/org/apache/iceberg/connect/IntegrationTopicRoutingTest.java @@ -0,0 +1,183 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.connect; + +import static org.assertj.core.api.Assertions.assertThat; + +import java.time.Duration; +import java.time.Instant; +import java.util.List; +import org.apache.iceberg.DataFile; +import org.apache.iceberg.Table; +import org.apache.iceberg.catalog.Namespace; +import org.apache.iceberg.catalog.SupportsNamespaces; +import org.apache.iceberg.catalog.TableIdentifier; +import org.awaitility.Awaitility; +import org.junit.jupiter.api.AfterEach; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.NullSource; +import org.junit.jupiter.params.provider.ValueSource; + +public class IntegrationTopicRoutingTest extends IntegrationTestBase { + + private static final String TEST_DB = "test"; + private static final String TEST_TABLE1 = "table1"; + private static final String TEST_TABLE2 = "table2"; + private static final TableIdentifier TABLE_IDENTIFIER1 = TableIdentifier.of(TEST_DB, TEST_TABLE1); + private static final TableIdentifier TABLE_IDENTIFIER2 = TableIdentifier.of(TEST_DB, TEST_TABLE2); + private static final String TEST_TOPIC1 = "topic1"; + private static final String TEST_TOPIC2 = "topic2"; + + @BeforeEach + public void before() { + createTopic(TEST_TOPIC1, TEST_TOPIC_PARTITIONS); + createTopic(TEST_TOPIC2, TEST_TOPIC_PARTITIONS); + ((SupportsNamespaces) catalog()).createNamespace(Namespace.of(TEST_DB)); + } + + @AfterEach + public void after() { + context().stopConnector(connectorName()); + deleteTopic(TEST_TOPIC1); + deleteTopic(TEST_TOPIC2); + catalog().dropTable(TABLE_IDENTIFIER1); + catalog().dropTable(TABLE_IDENTIFIER2); + ((SupportsNamespaces) catalog()).dropNamespace(Namespace.of(TEST_DB)); + } + + @ParameterizedTest + @NullSource + @ValueSource(strings = "test_branch") + public void testTopicNameRouter(String branch) { + // partitioned table + catalog().createTable(TABLE_IDENTIFIER1, TestEvent.TEST_SCHEMA, TestEvent.TEST_SPEC); + // unpartitioned table + catalog().createTable(TABLE_IDENTIFIER2, TestEvent.TEST_SCHEMA); + + boolean useSchema = branch == null; // use a schema for one of the tests + // set offset reset to earliest so we don't miss any test messages + KafkaConnectUtils.Config connectorConfig = + new 
KafkaConnectUtils.Config(connectorName()) + .config("topics", String.format("%s,%s", TEST_TOPIC1, TEST_TOPIC2)) + .config("connector.class", IcebergSinkConnector.class.getName()) + .config("tasks.max", 2) + .config("consumer.override.auto.offset.reset", "earliest") + .config("key.converter", "org.apache.kafka.connect.json.JsonConverter") + .config("key.converter.schemas.enable", false) + .config("value.converter", "org.apache.kafka.connect.json.JsonConverter") + .config("value.converter.schemas.enable", useSchema) + .config( + "iceberg.tables.route-with", + "org.apache.iceberg.connect.data.RecordRouter$TopicNameRecordRouter") + .config( + "iceberg.tables", + String.format("%s.%s,%s.%s", TEST_DB, TEST_TABLE1, TEST_DB, TEST_TABLE2)) + .config(String.format("iceberg.table.%s.%s.topics", TEST_DB, TEST_TABLE1), TEST_TOPIC1) + .config(String.format("iceberg.table.%s.%s.topics", TEST_DB, TEST_TABLE2), TEST_TOPIC2) + .config("iceberg.control.commit.interval-ms", 1000) + .config("iceberg.control.commit.timeout-ms", Integer.MAX_VALUE) + .config("iceberg.kafka.auto.offset.reset", "earliest"); + + runTest(connectorConfig, branch, useSchema); + } + + @ParameterizedTest + @NullSource + @ValueSource(strings = "test_branch") + public void testTopicRegexRouter(String branch) { + // partitioned table + catalog().createTable(TABLE_IDENTIFIER1, TestEvent.TEST_SCHEMA, TestEvent.TEST_SPEC); + // unpartitioned table + catalog().createTable(TABLE_IDENTIFIER2, TestEvent.TEST_SCHEMA); + + boolean useSchema = branch == null; // use a schema for one of the tests + // set offset reset to earliest so we don't miss any test messages + KafkaConnectUtils.Config connectorConfig = + new KafkaConnectUtils.Config(connectorName()) + .config("topics", String.format("%s,%s", TEST_TOPIC1, TEST_TOPIC2)) + .config("connector.class", IcebergSinkConnector.class.getName()) + .config("tasks.max", 2) + .config("consumer.override.auto.offset.reset", "earliest") + .config("key.converter", "org.apache.kafka.connect.json.JsonConverter") + .config("key.converter.schemas.enable", false) + .config("value.converter", "org.apache.kafka.connect.json.JsonConverter") + .config("value.converter.schemas.enable", useSchema) + .config( + "iceberg.tables.route-with", + "org.apache.iceberg.connect.data.RecordRouter$TopicRegexRecordRouter") + .config( + "iceberg.tables", + String.format("%s.%s,%s.%s", TEST_DB, TEST_TABLE1, TEST_DB, TEST_TABLE2)) + .config(String.format("iceberg.table.%s.%s.topic-regex", TEST_DB, TEST_TABLE1), ".*1") + .config(String.format("iceberg.table.%s.%s.topic-regex", TEST_DB, TEST_TABLE2), ".*2") + .config("iceberg.control.commit.interval-ms", 1000) + .config("iceberg.control.commit.timeout-ms", Integer.MAX_VALUE) + .config("iceberg.kafka.auto.offset.reset", "earliest"); + + runTest(connectorConfig, branch, useSchema); + } + + private void runTest(KafkaConnectUtils.Config connectorConfig, String branch, boolean useSchema) { + + context().connectorCatalogProperties().forEach(connectorConfig::config); + + if (branch != null) { + connectorConfig.config("iceberg.tables.default-commit-branch", branch); + } + + if (!useSchema) { + connectorConfig.config("value.converter.schemas.enable", false); + } + + context().startConnector(connectorConfig); + + TestEvent event1 = new TestEvent(1, "type1", Instant.now(), "test1"); + TestEvent event2 = new TestEvent(2, "type2", Instant.now(), "test2"); + TestEvent event3 = new TestEvent(3, "type3", Instant.now(), "test3"); + + send(TEST_TOPIC1, event1, useSchema); + send(TEST_TOPIC2, event2, 
useSchema); + send(TEST_TOPIC2, event3, useSchema); + flush(); + + Awaitility.await() + .atMost(Duration.ofSeconds(30)) + .pollInterval(Duration.ofSeconds(1)) + .untilAsserted(this::assertSnapshotAdded); + + List files = dataFiles(TABLE_IDENTIFIER1, branch); + assertThat(files).hasSize(1); + assertThat(files.get(0).recordCount()).isEqualTo(1); + assertSnapshotProps(TABLE_IDENTIFIER1, branch); + + files = dataFiles(TABLE_IDENTIFIER2, branch); + assertThat(files).hasSize(2); + assertThat(files.get(0).recordCount()).isEqualTo(1); + assertThat(files.get(1).recordCount()).isEqualTo(1); + assertSnapshotProps(TABLE_IDENTIFIER2, branch); + } + + private void assertSnapshotAdded() { + Table table = catalog().loadTable(TABLE_IDENTIFIER1); + assertThat(table.snapshots()).hasSize(1); + table = catalog().loadTable(TABLE_IDENTIFIER2); + assertThat(table.snapshots()).hasSize(1); + } +} diff --git a/kafka-connect/kafka-connect/src/main/java/org/apache/iceberg/connect/IcebergSinkConfig.java b/kafka-connect/kafka-connect/src/main/java/org/apache/iceberg/connect/IcebergSinkConfig.java index bf5b59a0f025..31cc66e466bf 100644 --- a/kafka-connect/kafka-connect/src/main/java/org/apache/iceberg/connect/IcebergSinkConfig.java +++ b/kafka-connect/kafka-connect/src/main/java/org/apache/iceberg/connect/IcebergSinkConfig.java @@ -28,6 +28,7 @@ import java.util.regex.Pattern; import java.util.stream.Collectors; import org.apache.iceberg.IcebergBuild; +import org.apache.iceberg.connect.data.RecordRouter; import org.apache.iceberg.relocated.com.google.common.annotations.VisibleForTesting; import org.apache.iceberg.relocated.com.google.common.base.Preconditions; import org.apache.iceberg.relocated.com.google.common.base.Splitter; @@ -53,6 +54,8 @@ public class IcebergSinkConfig extends AbstractConfig { public static final String INTERNAL_TRANSACTIONAL_SUFFIX_PROP = "iceberg.coordinator.transactional.suffix"; private static final String ROUTE_REGEX = "route-regex"; + private static final String TOPICS = "topics"; + private static final String TOPIC_REGEX = "topic-regex"; private static final String ID_COLUMNS = "id-columns"; private static final String PARTITION_BY = "partition-by"; private static final String COMMIT_BRANCH = "commit-branch"; @@ -66,6 +69,7 @@ public class IcebergSinkConfig extends AbstractConfig { private static final String CATALOG_NAME_PROP = "iceberg.catalog"; private static final String TABLES_PROP = "iceberg.tables"; + private static final String TABLES_ROUTE_WITH_PROP = "iceberg.tables.route-with"; private static final String TABLES_DYNAMIC_PROP = "iceberg.tables.dynamic-enabled"; private static final String TABLES_ROUTE_FIELD_PROP = "iceberg.tables.route-field"; private static final String TABLES_DEFAULT_COMMIT_BRANCH = "iceberg.tables.default-commit-branch"; @@ -114,6 +118,12 @@ private static ConfigDef newConfigDef() { null, Importance.HIGH, "Comma-delimited list of destination tables"); + configDef.define( + TABLES_ROUTE_WITH_PROP, + ConfigDef.Type.CLASS, + null, + Importance.MEDIUM, + "Routing mechanism for the tables: FIELD_VALUE, FIELD_REGEX, TOPIC_NAME, TOPIC_REGEX"); configDef.define( TABLES_DYNAMIC_PROP, ConfigDef.Type.BOOLEAN, @@ -306,6 +316,10 @@ public boolean dynamicTablesEnabled() { return getBoolean(TABLES_DYNAMIC_PROP); } + public Class tablesRouteWith() { + return (Class) getClass(TABLES_ROUTE_WITH_PROP); + } + public String tablesRouteField() { return getString(TABLES_ROUTE_FIELD_PROP); } @@ -332,6 +346,11 @@ public TableSinkConfig tableConfig(String tableName) { String 
routeRegexStr = tableConfig.get(ROUTE_REGEX); Pattern routeRegex = routeRegexStr == null ? null : Pattern.compile(routeRegexStr); + String topics = tableConfig.get(TOPICS); + + String topicRegexStr = tableConfig.get(TOPIC_REGEX); + Pattern topicRegex = topicRegexStr == null ? null : Pattern.compile(topicRegexStr); + String idColumnsStr = tableConfig.getOrDefault(ID_COLUMNS, tablesDefaultIdColumns()); List idColumns = stringToList(idColumnsStr, ","); @@ -342,7 +361,8 @@ public TableSinkConfig tableConfig(String tableName) { String commitBranch = tableConfig.getOrDefault(COMMIT_BRANCH, tablesDefaultCommitBranch()); - return new TableSinkConfig(routeRegex, idColumns, partitionBy, commitBranch); + return new TableSinkConfig( + routeRegex, topics, topicRegex, idColumns, partitionBy, commitBranch); }); } diff --git a/kafka-connect/kafka-connect/src/main/java/org/apache/iceberg/connect/TableSinkConfig.java b/kafka-connect/kafka-connect/src/main/java/org/apache/iceberg/connect/TableSinkConfig.java index 0ecde1f7dd0b..66259de7099c 100644 --- a/kafka-connect/kafka-connect/src/main/java/org/apache/iceberg/connect/TableSinkConfig.java +++ b/kafka-connect/kafka-connect/src/main/java/org/apache/iceberg/connect/TableSinkConfig.java @@ -24,13 +24,22 @@ public class TableSinkConfig { private final Pattern routeRegex; + private final String topics; + private final Pattern topicRegex; private final List idColumns; private final List partitionBy; private final String commitBranch; public TableSinkConfig( - Pattern routeRegex, List idColumns, List partitionBy, String commitBranch) { + Pattern routeRegex, + String topics, + Pattern topicRegex, + List idColumns, + List partitionBy, + String commitBranch) { this.routeRegex = routeRegex; + this.topics = topics; + this.topicRegex = topicRegex; this.idColumns = idColumns; this.partitionBy = partitionBy; this.commitBranch = commitBranch; @@ -40,6 +49,14 @@ public Pattern routeRegex() { return routeRegex; } + public String topics() { + return topics; + } + + public Pattern topicRegex() { + return topicRegex; + } + public List idColumns() { return idColumns; } diff --git a/kafka-connect/kafka-connect/src/main/java/org/apache/iceberg/connect/data/RecordRouter.java b/kafka-connect/kafka-connect/src/main/java/org/apache/iceberg/connect/data/RecordRouter.java new file mode 100644 index 000000000000..d4135a2a0fc4 --- /dev/null +++ b/kafka-connect/kafka-connect/src/main/java/org/apache/iceberg/connect/data/RecordRouter.java @@ -0,0 +1,213 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg.connect.data; + +import java.util.Collections; +import java.util.List; +import java.util.Locale; +import java.util.Map; +import java.util.regex.Pattern; +import java.util.stream.Collectors; +import org.apache.iceberg.catalog.Catalog; +import org.apache.iceberg.connect.IcebergSinkConfig; +import org.apache.iceberg.relocated.com.google.common.base.Preconditions; +import org.apache.iceberg.relocated.com.google.common.base.Splitter; +import org.apache.iceberg.relocated.com.google.common.collect.Lists; +import org.apache.iceberg.relocated.com.google.common.collect.Maps; +import org.apache.kafka.connect.sink.SinkRecord; + +/** + * Abstract class for routing records to tables. + * + *

<p>This class should be extended to create custom routing of records to tables. + */ +public abstract class RecordRouter { + + private final IcebergWriterFactory writerFactory; + private final IcebergSinkConfig config; + private final Map<String, RecordWriter> writers = Maps.newHashMap(); + + public RecordRouter(Catalog catalog, IcebergSinkConfig config) { + this.config = config; + this.writerFactory = new IcebergWriterFactory(catalog, config); + } + + public abstract void routeRecord(SinkRecord record); + + /** Get the configuration passed to the Kafka Connect plugin. */ + public IcebergSinkConfig config() { + return config; + } + + /** + * Write the record to the table. + * + * @param tableName The name of the table to write the record to. + * @param record The record to write to the table. + * @param ignoreMissingTable If true, missing tables are ignored and no error is thrown. + */ + public void writeToTable(String tableName, SinkRecord record, boolean ignoreMissingTable) { + writers + .computeIfAbsent( + tableName, notUsed -> writerFactory.createWriter(tableName, record, ignoreMissingTable)) + .write(record); + } + + /** + * Extract the value of a field from the record as a string. + * + * @param record The Kafka record to extract the value from. + * @param field The name of the field to extract. + * @return The value of the field in the record as a string. + */ + public String extractFieldFromRecordValue(SinkRecord record, String field) { + if (record.value() == null) { + return null; + } + Object value = RecordUtils.extractFromRecordValue(record.value(), field); + return value == null ? null : value.toString(); + } + + void close() { + writers.values().forEach(RecordWriter::close); + } + + List<IcebergWriterResult> completeWrite() { + return writers.values().stream() + .flatMap(writer -> writer.complete().stream()) + .collect(Collectors.toList()); + } + + void clearWriters() { + writers.clear(); + } + + /** Route each record to all of the configured tables. */ + public static class AllTablesRecordRouter extends RecordRouter { + public AllTablesRecordRouter(Catalog catalog, IcebergSinkConfig config) { + super(catalog, config); + } + + @Override + public void routeRecord(SinkRecord record) { + config().tables().forEach(tableName -> writeToTable(tableName, record, false)); + } + } + + /** Route records to tables based on a regex that matches the value of a field in the data. */ + public static class StaticRecordRouter extends RecordRouter { + private final String routeField; + private final Map<String, Pattern> tablePatterns = Maps.newHashMap(); + + public StaticRecordRouter(Catalog catalog, IcebergSinkConfig config) { + super(catalog, config); + this.routeField = config.tablesRouteField(); + Preconditions.checkNotNull(routeField, "Route field cannot be null with static routing"); + config + .tables() + .forEach( + tableName -> + tablePatterns.put(tableName, config.tableConfig(tableName).routeRegex())); + } + + @Override + public void routeRecord(SinkRecord record) { + String routeValue = extractFieldFromRecordValue(record, routeField); + if (routeValue != null) { + tablePatterns.forEach( + (tableName, tablePattern) -> { + if (tablePattern != null && tablePattern.matcher(routeValue).matches()) { + writeToTable(tableName, record, false); + } + }); + } + } + } + + /** Route records to the table specified in a field in the data. 
*/ + public static class DynamicRecordRouter extends RecordRouter { + private final String routeField; + + public DynamicRecordRouter(Catalog catalog, IcebergSinkConfig config) { + super(catalog, config); + routeField = config.tablesRouteField(); + Preconditions.checkNotNull(routeField, "Route field cannot be null with dynamic routing"); + } + + @Override + public void routeRecord(SinkRecord record) { + String routeValue = extractFieldFromRecordValue(record, routeField); + if (routeValue != null) { + String tableName = routeValue.toLowerCase(Locale.ROOT); + writeToTable(tableName, record, true); + } + } + } + + /** Route records to tables with the same name as the record's topic. */ + public static class TopicNameRecordRouter extends RecordRouter { + private final Map<String, List<String>> topicTablesMap = Maps.newHashMap(); + + public TopicNameRecordRouter(Catalog catalog, IcebergSinkConfig config) { + super(catalog, config); + config + .tables() + .forEach( + tableName -> { + Iterable<String> topics = + Splitter.on(',').split(config.tableConfig(tableName).topics()); + for (String topic : topics) { + topicTablesMap.computeIfAbsent(topic, k -> Lists.newArrayList()).add(tableName); + } + }); + } + + @Override + public void routeRecord(SinkRecord record) { + topicTablesMap + .getOrDefault(record.topic(), Collections.emptyList()) + .forEach(tableName -> writeToTable(tableName, record, true)); + } + } + + /** Route records to tables whose topic regex matches the record's topic. */ + public static class TopicRegexRecordRouter extends RecordRouter { + private final Map<String, Pattern> tablePatterns = Maps.newHashMap(); + + public TopicRegexRecordRouter(Catalog catalog, IcebergSinkConfig config) { + super(catalog, config); + config + .tables() + .forEach( + tableName -> + tablePatterns.putIfAbsent(tableName, config.tableConfig(tableName).topicRegex())); + } + + @Override + public void routeRecord(SinkRecord record) { + // Write to every table whose topic regex matches the record's topic + tablePatterns.forEach( + (tableName, tablePattern) -> { + if (tablePattern != null && tablePattern.matcher(record.topic()).matches()) { + writeToTable(tableName, record, false); + } + }); + } + } +} diff --git a/kafka-connect/kafka-connect/src/main/java/org/apache/iceberg/connect/data/SinkWriter.java b/kafka-connect/kafka-connect/src/main/java/org/apache/iceberg/connect/data/SinkWriter.java index f81155e13777..7da8182dd5fa 100644 --- a/kafka-connect/kafka-connect/src/main/java/org/apache/iceberg/connect/data/SinkWriter.java +++ b/kafka-connect/kafka-connect/src/main/java/org/apache/iceberg/connect/data/SinkWriter.java @@ -18,47 +18,57 @@ */ package org.apache.iceberg.connect.data; +import java.lang.reflect.InvocationTargetException; import java.time.Instant; import java.time.OffsetDateTime; import java.time.ZoneOffset; import java.util.Collection; import java.util.List; -import java.util.Locale; import java.util.Map; -import java.util.regex.Pattern; -import java.util.stream.Collectors; import org.apache.iceberg.catalog.Catalog; import org.apache.iceberg.connect.IcebergSinkConfig; -import org.apache.iceberg.relocated.com.google.common.base.Preconditions; import org.apache.iceberg.relocated.com.google.common.collect.Maps; import org.apache.kafka.common.TopicPartition; import org.apache.kafka.connect.sink.SinkRecord; public class SinkWriter { - private final IcebergSinkConfig config; - private final IcebergWriterFactory writerFactory; - private final Map<String, RecordWriter> writers; private final Map<TopicPartition, Offset> sourceOffsets; + private final RecordRouter router; public 
SinkWriter(Catalog catalog, IcebergSinkConfig config) { - this.config = config; - this.writerFactory = new IcebergWriterFactory(catalog, config); - this.writers = Maps.newHashMap(); this.sourceOffsets = Maps.newHashMap(); + if (config.dynamicTablesEnabled()) { + router = new RecordRouter.DynamicRecordRouter(catalog, config); + } else if (config.tablesRouteWith() == null && config.tablesRouteField() != null) { + router = new RecordRouter.StaticRecordRouter(catalog, config); + } else if (config.tablesRouteWith() != null) { + try { + router = + config + .tablesRouteWith() + .getDeclaredConstructor(Catalog.class, IcebergSinkConfig.class) + .newInstance(catalog, config); + } catch (NoSuchMethodException + | InstantiationException + | IllegalAccessException + | InvocationTargetException e) { + throw new IllegalArgumentException( + "Cannot create router from iceberg.tables.route-with", e); + } + } else { + router = new RecordRouter.AllTablesRecordRouter(catalog, config); + } } public void close() { - writers.values().forEach(RecordWriter::close); + router.close(); } public SinkWriterResult completeWrite() { - List writerResults = - writers.values().stream() - .flatMap(writer -> writer.complete().stream()) - .collect(Collectors.toList()); + List writerResults = router.completeWrite(); Map offsets = Maps.newHashMap(sourceOffsets); - writers.clear(); + router.clearWriters(); sourceOffsets.clear(); return new SinkWriterResult(writerResults, offsets); @@ -79,63 +89,6 @@ private void save(SinkRecord record) { new TopicPartition(record.topic(), record.kafkaPartition()), new Offset(record.kafkaOffset() + 1, timestamp)); - if (config.dynamicTablesEnabled()) { - routeRecordDynamically(record); - } else { - routeRecordStatically(record); - } - } - - private void routeRecordStatically(SinkRecord record) { - String routeField = config.tablesRouteField(); - - if (routeField == null) { - // route to all tables - config - .tables() - .forEach( - tableName -> { - writerForTable(tableName, record, false).write(record); - }); - - } else { - String routeValue = extractRouteValue(record.value(), routeField); - if (routeValue != null) { - config - .tables() - .forEach( - tableName -> { - Pattern regex = config.tableConfig(tableName).routeRegex(); - if (regex != null && regex.matcher(routeValue).matches()) { - writerForTable(tableName, record, false).write(record); - } - }); - } - } - } - - private void routeRecordDynamically(SinkRecord record) { - String routeField = config.tablesRouteField(); - Preconditions.checkNotNull(routeField, "Route field cannot be null with dynamic routing"); - - String routeValue = extractRouteValue(record.value(), routeField); - if (routeValue != null) { - String tableName = routeValue.toLowerCase(Locale.ROOT); - writerForTable(tableName, record, true).write(record); - } - } - - private String extractRouteValue(Object recordValue, String routeField) { - if (recordValue == null) { - return null; - } - Object routeValue = RecordUtils.extractFromRecordValue(recordValue, routeField); - return routeValue == null ? 
null : routeValue.toString(); - } - - private RecordWriter writerForTable( - String tableName, SinkRecord sample, boolean ignoreMissingTable) { - return writers.computeIfAbsent( - tableName, notUsed -> writerFactory.createWriter(tableName, sample, ignoreMissingTable)); + router.routeRecord(record); } } diff --git a/kafka-connect/kafka-connect/src/test/java/org/apache/iceberg/connect/data/SinkWriterTest.java b/kafka-connect/kafka-connect/src/test/java/org/apache/iceberg/connect/data/SinkWriterTest.java index 4a17b926fc56..d0123bfa3576 100644 --- a/kafka-connect/kafka-connect/src/test/java/org/apache/iceberg/connect/data/SinkWriterTest.java +++ b/kafka-connect/kafka-connect/src/test/java/org/apache/iceberg/connect/data/SinkWriterTest.java @@ -105,6 +105,43 @@ public void testDefaultNoRoute() { assertThat(writerResults.size()).isEqualTo(0); } + @Test + public void testTopicRegexRoute() { + TableSinkConfig tableConfig = mock(TableSinkConfig.class); + when(tableConfig.topicRegex()).thenReturn(Pattern.compile("topic")); + + IcebergSinkConfig config = mock(IcebergSinkConfig.class); + when(config.tablesRouteWith()) + .thenAnswer(invocation -> RecordRouter.TopicRegexRecordRouter.class); + when(config.tables()).thenReturn(ImmutableList.of(TABLE_IDENTIFIER.toString())); + when(config.tableConfig(any())).thenReturn(tableConfig); + + Map<String, Object> value = ImmutableMap.of(); + List<IcebergWriterResult> writerResults = sinkWriterTest(value, config); + assertThat(writerResults.size()).isEqualTo(1); + IcebergWriterResult writerResult = writerResults.get(0); + assertThat(writerResult.tableIdentifier()).isEqualTo(TABLE_IDENTIFIER); + } + + @Test + public void testTopicNameRoute() { + TableSinkConfig tableConfig = mock(TableSinkConfig.class); + when(tableConfig.topics()).thenReturn("topic"); + + IcebergSinkConfig config = mock(IcebergSinkConfig.class); + when(config.tablesRouteWith()) + .thenAnswer(invocation -> RecordRouter.TopicNameRecordRouter.class); + when(config.tables()).thenReturn(ImmutableList.of(TABLE_IDENTIFIER.toString())); + when(config.tableConfig(any())).thenReturn(tableConfig); + + Map<String, Object> value = ImmutableMap.of(); + List<IcebergWriterResult> writerResults = sinkWriterTest(value, config); + assertThat(writerResults.size()).isEqualTo(1); + IcebergWriterResult writerResult = writerResults.get(0); + assertThat(writerResult.tableIdentifier()).isEqualTo(TABLE_IDENTIFIER); + } + @Test public void testStaticRoute() { TableSinkConfig tableConfig = mock(TableSinkConfig.class);