diff --git a/.github/workflows/flink_cdc.yml b/.github/workflows/flink_cdc.yml index 22b0193357..81d481c7fe 100644 --- a/.github/workflows/flink_cdc.yml +++ b/.github/workflows/flink_cdc.yml @@ -56,7 +56,8 @@ env: flink-cdc-connect/flink-cdc-pipeline-connectors/flink-cdc-pipeline-connector-doris,\ flink-cdc-connect/flink-cdc-pipeline-connectors/flink-cdc-pipeline-connector-starrocks,\ flink-cdc-connect/flink-cdc-pipeline-connectors/flink-cdc-pipeline-connector-kafka,\ - flink-cdc-connect/flink-cdc-pipeline-connectors/flink-cdc-pipeline-connector-paimon" + flink-cdc-connect/flink-cdc-pipeline-connectors/flink-cdc-pipeline-connector-paimon,\ + flink-cdc-connect/flink-cdc-pipeline-connectors/flink-cdc-pipeline-connector-maxcompute" MODULES_MYSQL: "\ flink-cdc-connect/flink-cdc-source-connectors/flink-connector-mysql-cdc,\ diff --git a/docs/content.zh/docs/connectors/pipeline-connectors/maxcompute.md b/docs/content.zh/docs/connectors/pipeline-connectors/maxcompute.md new file mode 100644 index 0000000000..dd68721a10 --- /dev/null +++ b/docs/content.zh/docs/connectors/pipeline-connectors/maxcompute.md @@ -0,0 +1,328 @@ +--- +title: "MaxCompute" +weight: 7 +type: docs +aliases: + - /connectors/maxcompute +--- + + + +# MaxCompute Connector + +MaxCompute Pipeline 连接器可以用作 Pipeline 的 *Data Sink*,将数据写入[MaxCompute](https://www.aliyun.com/product/odps)。 +本文档介绍如何设置 MaxCompute Pipeline 连接器。 + +## 连接器的功能 + +* 自动建表 +* 表结构变更同步 +* 数据实时同步 + +## 示例 + +从 MySQL 读取数据同步到 MaxCompute 的 Pipeline 可以定义如下: + +```yaml +source: + type: mysql + name: MySQL Source + hostname: 127.0.0.1 + port: 3306 + username: admin + password: pass + tables: adb.\.*, bdb.user_table_[0-9]+, [app|web].order_\.* + server-id: 5401-5404 + +sink: + type: maxcompute + name: MaxCompute Sink + accessId: ak + accessKey: sk + endpoint: endpoint + project: flink_cdc + bucketSize: 8 + +pipeline: + name: MySQL to MaxCompute Pipeline + parallelism: 2 +``` + +## 连接器配置项 + +
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
OptionRequiredDefaultTypeDescription
typerequired(none)String指定要使用的连接器, 这里需要设置成 'maxcompute'.
nameoptional(none)StringSink 的名称.
accessIdrequired(none)String阿里云账号或RAM用户的AccessKey ID。您可以进入 + AccessKey管理页面 获取AccessKey ID。
accessKeyrequired(none)StringAccessKey ID对应的AccessKey Secret。您可以进入 + AccessKey管理页面 获取AccessKey Secret。
endpointrequired(none)StringMaxCompute服务的连接地址。您需要根据创建MaxCompute项目时选择的地域以及网络连接方式配置Endpoint。各地域及网络对应的Endpoint值,请参见 + Endpoint
projectrequired(none)StringMaxCompute项目名称。您可以登录 + MaxCompute控制台,在 工作区 > 项目管理 页面获取MaxCompute项目名称。
tunnelEndpointoptional(none)StringMaxCompute Tunnel服务的连接地址,通常这项配置可以根据指定的project所在的region进行自动路由。仅在使用代理等特殊网络环境下使用该配置。
quotaNameoptional(none)StringMaxCompute 数据传输使用的独享资源组名称,如不指定该配置,则使用共享资源组。详情可以参考 + 使用 Maxcompute 独享资源组
stsTokenoptional(none)String当使用RAM角色颁发的短时有效的访问令牌(STS Token)进行鉴权时,需要指定该参数。
bucketsNumoptional16Integer自动创建 MaxCompute Delta 表时使用的桶数。使用方式可以参考 + Delta Table 概述
compressAlgorithmoptionalzlibString写入MaxCompute时使用的数据压缩算法,当前支持raw(不进行压缩),zlibsnappy
totalBatchSizeoptional64MBString内存中缓冲的数据量大小,单位为分区级(非分区表单位为表级),不同分区(表)的缓冲区相互独立,达到阈值后数据写入到MaxCompute。
bucketBatchSizeoptional4MBString内存中缓冲的数据量大小,单位为桶级,仅写入 Delta 表时生效。不同数据桶的缓冲区相互独立,达到阈值后将该桶数据写入到MaxCompute。
numCommitThreadsoptional16Integercheckpoint阶段,能够同时处理的分区(表)数量。
numFlushConcurrentoptional4Integer写入数据到MaxCompute时,能够同时写入的桶数量。仅写入 Delta 表时生效。
+
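例如,下面的 sink 配置片段在必填项之外额外写出了几个可选的写入参数(示例中的可选参数取值即为上表中的默认值,accessId、accessKey、endpoint 等需替换为实际值):

```yaml
sink:
  type: maxcompute
  name: MaxCompute Sink
  accessId: ak
  accessKey: sk
  endpoint: endpoint
  project: flink_cdc
  # 以下为可选参数,键名与上表一致,取值为各自的默认值
  bucketsNum: 16
  compressAlgorithm: zlib
  totalBatchSize: 64MB
  bucketBatchSize: 4MB
  numCommitThreads: 16
  numFlushConcurrent: 4
```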
+ +## 使用说明 + +* 连接器支持自动建表,自动将源表的位置信息和数据类型映射到 MaxCompute 表(参见下文映射表)。当源表有主键时,自动创建 MaxCompute Delta 表,否则创建普通 MaxCompute 表(Append 表) +* 当写入普通 MaxCompute 表(Append 表)时,会忽略`delete`操作,`update`操作会被视为`insert`操作 +* 目前仅支持 at-least-once,Delta 表由于主键特性能够实现幂等写 +* 对于表结构变更同步: + * 新增列只能添加到最后一列 + * 修改列类型时,只能修改为兼容的类型,兼容规则可参考 [ALTER TABLE](https://help.aliyun.com/zh/maxcompute/user-guide/alter-table) + +## 表位置映射 + +连接器自动建表时,使用如下映射关系,将源表的位置信息映射到 MaxCompute 表的位置。注意,当 MaxCompute 项目不支持 Schema 模型时,每个同步任务仅能同步一个 MySQL database(示例见下方映射表之后);其他 DataSource 同理,连接器会忽略 TableId.namespace 信息。 + +<div class="wy-table-responsive">
+ + + + + + + + + + + + + + + + + + + + + + + + + +
Flink CDC 中抽象MaxCompute 位置Mysql 位置
配置文件中projectproject(none)
TableId.namespaceschema(仅当MaxCompute项目支持Schema模型时,如不支持,将忽略该配置)database
TableId.tableNametabletable
+
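当 MaxCompute 项目不支持 Schema 模型时,可将 source 的 tables 限制在单个 database 内。下面是一个按照该约束编写的示例片段(库名 adb 仅为示意):

```yaml
source:
  type: mysql
  # 仅同步一个 database(此处以 adb 为例)
  tables: adb.\.*

sink:
  type: maxcompute
  # 所有表都会写入该 project,TableId.namespace(即 adb)会被忽略
  project: flink_cdc
```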
+ +## 数据类型映射 + +
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
Flink TypeMaxCompute Type
CHAR/VARCHARSTRING
BOOLEANBOOLEAN
BINARY/VARBINARYBINARY
DECIMALDECIMAL
TINYINTTINYINT
SMALLINTSMALLINT
INTEGERINTEGER
BIGINTBIGINT
FLOATFLOAT
DOUBLEDOUBLE
TIME_WITHOUT_TIME_ZONESTRING
DATEDATE
TIMESTAMP_WITHOUT_TIME_ZONETIMESTAMP_NTZ
TIMESTAMP_WITH_LOCAL_TIME_ZONETIMESTAMP
TIMESTAMP_WITH_TIME_ZONETIMESTAMP
ARRAYARRAY
MAPMAP
ROWSTRUCT
+
+ +{{< top >}} diff --git a/docs/content/docs/connectors/pipeline-connectors/maxcompute.md b/docs/content/docs/connectors/pipeline-connectors/maxcompute.md new file mode 100644 index 0000000000..d1d39a6457 --- /dev/null +++ b/docs/content/docs/connectors/pipeline-connectors/maxcompute.md @@ -0,0 +1,322 @@ +--- +title: "MaxCompute" +weight: 7 +type: docs +aliases: + - /connectors/maxcompute +--- + + + +# MaxCompute Connector + +MaxCompute connector can be used as the *Data Sink* of the pipeline, and write data +to [MaxCompute](https://www.aliyun.com/product/odps). This document describes how to set up the MaxCompute connector. + +## What can the connector do? + +* Create table automatically if not exist +* Schema change synchronization +* Data synchronization + +## Example + +The pipeline for reading data from MySQL and sink to MaxCompute can be defined as follows: + +```yaml +source: + type: mysql + name: MySQL Source + hostname: 127.0.0.1 + port: 3306 + username: admin + password: pass + tables: adb.\.*, bdb.user_table_[0-9]+, [app|web].order_\.* + server-id: 5401-5404 + +sink: + type: maxcompute + name: MaxCompute Sink + accessId: ak + accessKey: sk + endpoint: endpoint + project: flink_cdc + bucketSize: 8 + +pipeline: + name: MySQL to MaxCompute Pipeline + parallelism: 2 +``` + +## Connector Options + +
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
OptionRequiredDefaultTypeDescription
typerequired(none)StringSpecify what connector to use, here should be 'maxcompute'.
nameoptional(none)StringThe name of the sink.
accessIdrequired(none)StringAccessKey ID of Alibaba Cloud account or RAM user. You can enter + AccessKey management page Obtain AccessKey ID.
accessKeyrequired(none)StringAccessKey Secret corresponding to AccessKey ID. You can enter + AccessKey management page Obtain AccessKey Secret.
endpointrequired(none)StringThe connection address for the MaxCompute service. You need to configure the Endpoint based on the region selected when creating the MaxCompute project and the network connection method. For values corresponding to each region and network, please refer to Endpoint.
projectrequired(none)StringThe name of the MaxCompute project. You can log in to the MaxCompute console and obtain the MaxCompute project name on the Workspace > Project Management page.
tunnelEndpointoptional(none)StringThe connection address for the MaxCompute Tunnel service. Typically, this configuration can be auto-routed based on the region where the specified project is located. It is used only in special network environments such as when using a proxy.
quotaNameoptional(none)StringThe name of the exclusive resource group for MaxCompute data transfer. If not specified, the shared resource group is used. For details, refer to Using exclusive resource groups for Maxcompute
stsTokenoptional(none)StringWhen using a temporary access token (STS Token) issued by a RAM role for authentication, this parameter must be specified.
bucketsNumoptional16IntegerThe number of buckets used when auto-creating MaxCompute Delta tables. For usage, refer to Delta Table Overview
compressAlgorithmoptionalzlibStringThe data compression algorithm used when writing to MaxCompute. Currently supports raw (no compression), zlib, and snappy.
totalBatchSizeoptional64MBStringThe size of the data buffer in memory, by partition level (for non-partitioned tables, by table level). Buffers for different partitions (tables) are independent, and data is written to MaxCompute when the threshold is reached.
bucketBatchSizeoptional4MBStringThe size of the data buffer in memory, by bucket level. This is effective only when writing to Delta tables. Buffers for different data buckets are independent, and the bucket data is written to MaxCompute when the threshold is reached.
numCommitThreadsoptional16IntegerThe number of partitions (tables) that can be processed simultaneously during the checkpoint stage.
numFlushConcurrentoptional4IntegerThe number of buckets that can be written to MaxCompute simultaneously. This is effective only when writing to Delta tables.
+
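As an illustration, the sink block below spells out several of the optional write options next to the required ones (the optional values shown are simply the documented defaults; accessId, accessKey and endpoint must be replaced with real values):

```yaml
sink:
  type: maxcompute
  name: MaxCompute Sink
  accessId: ak
  accessKey: sk
  endpoint: endpoint
  project: flink_cdc
  # optional tuning options; the keys match the table above and the values are the defaults
  bucketsNum: 16
  compressAlgorithm: zlib
  totalBatchSize: 64MB
  bucketBatchSize: 4MB
  numCommitThreads: 16
  numFlushConcurrent: 4
```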
+ +## Usage Instructions + +* The connector supports automatic table creation, mapping the source table's location and data types to the MaxCompute table automatically (see the mapping tables below). When the source table has a primary key, a MaxCompute Delta table is created automatically; otherwise, a regular MaxCompute table (Append table) is created. +* When writing to a regular MaxCompute table (Append table), `delete` operations are ignored and `update` operations are treated as `insert` operations. +* Currently, only at-least-once is supported. Delta tables can achieve idempotent writes thanks to their primary key. +* For synchronization of table structure changes: + * A new column can only be added as the last column. + * A column type can only be changed to a compatible type. For the compatibility rules, refer to [ALTER TABLE](https://help.aliyun.com/zh/maxcompute/user-guide/alter-table). + +## Table Location Mapping + +When the connector creates tables automatically, it uses the following mapping to derive the location of the MaxCompute table from the location of the source table. Note that when the MaxCompute project does not support the Schema model, each synchronization task can only synchronize one MySQL database (see the example after the mapping table below); the same applies to other data sources, where the connector ignores the TableId.namespace information. + +<div class="wy-table-responsive">
+ + + + + + + + + + + + + + + + + + + + + + + + + +
Abstract in Flink CDC MaxCompute LocationMySQL Location
project in the configuration fileproject(none)
TableId.namespaceschema (Only when the MaxCompute project supports the Schema model. If not supported, this configuration will be ignored)database
TableId.tableNametabletable
+
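When the MaxCompute project does not support the Schema model, the source can be restricted to a single database; the sketch below follows that constraint (the database name adb is illustrative):

```yaml
source:
  type: mysql
  # synchronize only one database (adb in this example)
  tables: adb.\.*

sink:
  type: maxcompute
  # every table is written into this project; TableId.namespace (adb) is ignored
  project: flink_cdc
```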
+ +## Data Type Mapping + +
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
Flink TypeMaxCompute Type
CHAR/VARCHARSTRING
BOOLEANBOOLEAN
BINARY/VARBINARYBINARY
DECIMALDECIMAL
TINYINTTINYINT
SMALLINTSMALLINT
INTEGERINTEGER
BIGINTBIGINT
FLOATFLOAT
DOUBLEDOUBLE
TIME_WITHOUT_TIME_ZONESTRING
DATEDATE
TIMESTAMP_WITHOUT_TIME_ZONETIMESTAMP_NTZ
TIMESTAMP_WITH_LOCAL_TIME_ZONETIMESTAMP
TIMESTAMP_WITH_TIME_ZONETIMESTAMP
ARRAYARRAY
MAPMAP
ROWSTRUCT
+
+ +{{< top >}} diff --git a/flink-cdc-connect/flink-cdc-pipeline-connectors/flink-cdc-pipeline-connector-maxcompute/pom.xml b/flink-cdc-connect/flink-cdc-pipeline-connectors/flink-cdc-pipeline-connector-maxcompute/pom.xml new file mode 100644 index 0000000000..cd81b5fe70 --- /dev/null +++ b/flink-cdc-connect/flink-cdc-pipeline-connectors/flink-cdc-pipeline-connector-maxcompute/pom.xml @@ -0,0 +1,116 @@ + + + + 4.0.0 + + org.apache.flink + flink-cdc-pipeline-connectors + ${revision} + + flink-cdc-pipeline-connector-maxcompute + flink-cdc-pipeline-connector-maxcompute + + + + org.apache.flink + flink-cdc-composer + ${project.version} + provided + + + + org.apache.flink + flink-runtime + ${flink.version} + provided + + + + org.apache.flink + flink-streaming-java + ${flink.version} + provided + + + + com.aliyun.odps + odps-sdk-core + 0.48.6-public + + + com.fasterxml.jackson.core + jackson-databind + + + aliyun-java-auth + com.aliyun + + + commons-codec + commons-codec + + + guava + com.google.guava + + + icu4j + com.ibm.icu + + + snappy-java + org.xerial.snappy + + + + + + org.testcontainers + testcontainers + 1.19.8 + test + + + + + + + org.apache.maven.plugins + maven-shade-plugin + + + shade-flink + package + + shade + + + false + + + *:* + + + + + + + + + diff --git a/flink-cdc-connect/flink-cdc-pipeline-connectors/flink-cdc-pipeline-connector-maxcompute/src/main/java/org/apache/flink/cdc/connectors/maxcompute/MaxComputeDataSink.java b/flink-cdc-connect/flink-cdc-pipeline-connectors/flink-cdc-pipeline-connector-maxcompute/src/main/java/org/apache/flink/cdc/connectors/maxcompute/MaxComputeDataSink.java new file mode 100644 index 0000000000..d147333766 --- /dev/null +++ b/flink-cdc-connect/flink-cdc-pipeline-connectors/flink-cdc-pipeline-connector-maxcompute/src/main/java/org/apache/flink/cdc/connectors/maxcompute/MaxComputeDataSink.java @@ -0,0 +1,56 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.flink.cdc.connectors.maxcompute; + +import org.apache.flink.cdc.common.event.DataChangeEvent; +import org.apache.flink.cdc.common.function.HashFunctionProvider; +import org.apache.flink.cdc.common.sink.DataSink; +import org.apache.flink.cdc.common.sink.EventSinkProvider; +import org.apache.flink.cdc.common.sink.FlinkSinkProvider; +import org.apache.flink.cdc.common.sink.MetadataApplier; +import org.apache.flink.cdc.connectors.maxcompute.options.MaxComputeOptions; +import org.apache.flink.cdc.connectors.maxcompute.options.MaxComputeWriteOptions; +import org.apache.flink.cdc.connectors.maxcompute.sink.MaxComputeEventSink; +import org.apache.flink.cdc.connectors.maxcompute.sink.MaxComputeHashFunctionProvider; + +/** A {@link DataSink} for "MaxCompute" connector. 
*/ +public class MaxComputeDataSink implements DataSink { + private final MaxComputeOptions options; + private final MaxComputeWriteOptions writeOptions; + + public MaxComputeDataSink(MaxComputeOptions options, MaxComputeWriteOptions writeOptions) { + this.options = options; + this.writeOptions = writeOptions; + } + + @Override + public EventSinkProvider getEventSinkProvider() { + return FlinkSinkProvider.of(new MaxComputeEventSink(options, writeOptions)); + } + + @Override + public MetadataApplier getMetadataApplier() { + return new MaxComputeMetadataApplier(options); + } + + @Override + public HashFunctionProvider getDataChangeEventHashFunctionProvider() { + return new MaxComputeHashFunctionProvider(options); + } +} diff --git a/flink-cdc-connect/flink-cdc-pipeline-connectors/flink-cdc-pipeline-connector-maxcompute/src/main/java/org/apache/flink/cdc/connectors/maxcompute/MaxComputeDataSinkFactory.java b/flink-cdc-connect/flink-cdc-pipeline-connectors/flink-cdc-pipeline-connector-maxcompute/src/main/java/org/apache/flink/cdc/connectors/maxcompute/MaxComputeDataSinkFactory.java new file mode 100644 index 0000000000..fb45623b52 --- /dev/null +++ b/flink-cdc-connect/flink-cdc-pipeline-connectors/flink-cdc-pipeline-connector-maxcompute/src/main/java/org/apache/flink/cdc/connectors/maxcompute/MaxComputeDataSinkFactory.java @@ -0,0 +1,130 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.flink.cdc.connectors.maxcompute; + +import org.apache.flink.cdc.common.configuration.ConfigOption; +import org.apache.flink.cdc.common.configuration.Configuration; +import org.apache.flink.cdc.common.factories.DataSinkFactory; +import org.apache.flink.cdc.common.pipeline.PipelineOptions; +import org.apache.flink.cdc.common.sink.DataSink; +import org.apache.flink.cdc.connectors.maxcompute.options.MaxComputeOptions; +import org.apache.flink.cdc.connectors.maxcompute.options.MaxComputeWriteOptions; +import org.apache.flink.configuration.MemorySize; + +import java.util.HashSet; +import java.util.Set; + +/** A {@link DataSinkFactory} for "MaxCompute" connector. 
*/ +public class MaxComputeDataSinkFactory implements DataSinkFactory { + + private static final String IDENTIFIER = "maxcompute"; + + @Override + public DataSink createDataSink(Context context) { + MaxComputeOptions options = + extractMaxComputeOptions( + context.getFactoryConfiguration(), context.getPipelineConfiguration()); + MaxComputeWriteOptions writeOptions = + extractMaxComputeWriteOptions(context.getFactoryConfiguration()); + return new MaxComputeDataSink(options, writeOptions); + } + + private MaxComputeOptions extractMaxComputeOptions( + Configuration factoryConfiguration, Configuration pipelineConfiguration) { + String accessId = factoryConfiguration.get(MaxComputeDataSinkOptions.ACCESS_ID); + String accessKey = factoryConfiguration.get(MaxComputeDataSinkOptions.ACCESS_KEY); + String endpoint = factoryConfiguration.get(MaxComputeDataSinkOptions.ENDPOINT); + String project = factoryConfiguration.get(MaxComputeDataSinkOptions.PROJECT); + String tunnelEndpoint = factoryConfiguration.get(MaxComputeDataSinkOptions.TUNNEL_ENDPOINT); + String quotaName = factoryConfiguration.get(MaxComputeDataSinkOptions.QUOTA_NAME); + String stsToken = factoryConfiguration.get(MaxComputeDataSinkOptions.STS_TOKEN); + int bucketSize = factoryConfiguration.get(MaxComputeDataSinkOptions.BUCKETS_NUM); + + String schemaOperatorUid = + pipelineConfiguration.get(PipelineOptions.PIPELINE_SCHEMA_OPERATOR_UID); + return MaxComputeOptions.builder(accessId, accessKey, endpoint, project) + .withTunnelEndpoint(tunnelEndpoint) + .withQuotaName(quotaName) + .withStsToken(stsToken) + .withBucketSize(bucketSize) + .withSchemaOperatorUid(schemaOperatorUid) + .build(); + } + + private MaxComputeWriteOptions extractMaxComputeWriteOptions( + Configuration factoryConfiguration) { + int numCommitThread = + factoryConfiguration.get(MaxComputeDataSinkOptions.NUM_COMMIT_THREADS); + String compressAlgorithm = + factoryConfiguration.get(MaxComputeDataSinkOptions.COMPRESS_ALGORITHM); + int flushConcurrent = + factoryConfiguration.get(MaxComputeDataSinkOptions.NUM_FLUSH_CONCURRENT); + long maxBufferSize = + MemorySize.parse( + factoryConfiguration.get( + MaxComputeDataSinkOptions.TOTAL_BATCH_SIZE)) + .getBytes(); + long maxSlotSize = + MemorySize.parse( + factoryConfiguration.get( + MaxComputeDataSinkOptions.BUCKET_BATCH_SIZE)) + .getBytes(); + + return MaxComputeWriteOptions.builder() + .withNumCommitThread(numCommitThread) + .withCompressAlgorithm(compressAlgorithm) + .withFlushConcurrent(flushConcurrent) + .withMaxBufferSize(maxBufferSize) + .withSlotBufferSize(maxSlotSize) + .build(); + } + + @Override + public String identifier() { + return IDENTIFIER; + } + + @Override + public Set> requiredOptions() { + Set> requiredOptions = new HashSet<>(); + requiredOptions.add(MaxComputeDataSinkOptions.ACCESS_ID); + requiredOptions.add(MaxComputeDataSinkOptions.ACCESS_KEY); + requiredOptions.add(MaxComputeDataSinkOptions.ENDPOINT); + requiredOptions.add(MaxComputeDataSinkOptions.PROJECT); + return requiredOptions; + } + + @Override + public Set> optionalOptions() { + Set> optionalOptions = new HashSet<>(); + // options + optionalOptions.add(MaxComputeDataSinkOptions.TUNNEL_ENDPOINT); + optionalOptions.add(MaxComputeDataSinkOptions.QUOTA_NAME); + optionalOptions.add(MaxComputeDataSinkOptions.STS_TOKEN); + optionalOptions.add(MaxComputeDataSinkOptions.BUCKETS_NUM); + // write options + optionalOptions.add(MaxComputeDataSinkOptions.NUM_COMMIT_THREADS); + optionalOptions.add(MaxComputeDataSinkOptions.COMPRESS_ALGORITHM); + 
optionalOptions.add(MaxComputeDataSinkOptions.NUM_FLUSH_CONCURRENT); + optionalOptions.add(MaxComputeDataSinkOptions.TOTAL_BATCH_SIZE); + optionalOptions.add(MaxComputeDataSinkOptions.BUCKET_BATCH_SIZE); + + return optionalOptions; + } +} diff --git a/flink-cdc-connect/flink-cdc-pipeline-connectors/flink-cdc-pipeline-connector-maxcompute/src/main/java/org/apache/flink/cdc/connectors/maxcompute/MaxComputeDataSinkOptions.java b/flink-cdc-connect/flink-cdc-pipeline-connectors/flink-cdc-pipeline-connector-maxcompute/src/main/java/org/apache/flink/cdc/connectors/maxcompute/MaxComputeDataSinkOptions.java new file mode 100644 index 0000000000..e28272b910 --- /dev/null +++ b/flink-cdc-connect/flink-cdc-pipeline-connectors/flink-cdc-pipeline-connector-maxcompute/src/main/java/org/apache/flink/cdc/connectors/maxcompute/MaxComputeDataSinkOptions.java @@ -0,0 +1,108 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.flink.cdc.connectors.maxcompute; + +import org.apache.flink.cdc.common.configuration.ConfigOption; +import org.apache.flink.cdc.common.configuration.ConfigOptions; + +/** Options for MaxCompute Data Sink. */ +public class MaxComputeDataSinkOptions { + // basic options. + public static final ConfigOption ACCESS_ID = + ConfigOptions.key("accessId") + .stringType() + .noDefaultValue() + .withDescription("MaxCompute user access id."); + + public static final ConfigOption ACCESS_KEY = + ConfigOptions.key("accessKey") + .stringType() + .noDefaultValue() + .withDescription("MaxCompute user access key."); + + public static final ConfigOption ENDPOINT = + ConfigOptions.key("endpoint") + .stringType() + .noDefaultValue() + .withDescription("MaxCompute endpoint."); + + public static final ConfigOption PROJECT = + ConfigOptions.key("project") + .stringType() + .noDefaultValue() + .withDescription("MaxCompute project."); + + public static final ConfigOption TUNNEL_ENDPOINT = + ConfigOptions.key("tunnelEndpoint") + .stringType() + .noDefaultValue() + .withDescription("MaxCompute tunnel end point."); + public static final ConfigOption QUOTA_NAME = + ConfigOptions.key("quotaName") + .stringType() + .noDefaultValue() + .withDescription( + "MaxCompute tunnel quota name, note that not quota nick-name."); + + public static final ConfigOption STS_TOKEN = + ConfigOptions.key("stsToken") + .stringType() + .noDefaultValue() + .withDescription("MaxCompute sts token."); + + public static final ConfigOption BUCKETS_NUM = + ConfigOptions.key("bucketsNum") + .intType() + .defaultValue(16) + .withDescription( + "The batch size of MaxCompute table when automatically create table."); + + // write options. 
+ public static final ConfigOption COMPRESS_ALGORITHM = + ConfigOptions.key("compressAlgorithm") + .stringType() + .defaultValue("zlib") + .withDescription( + "The compress algorithm of data upload to MaxCompute, support 'zlib', 'snappy', 'raw'."); + + public static final ConfigOption TOTAL_BATCH_SIZE = + ConfigOptions.key("totalBatchSize") + .stringType() + .defaultValue("64MB") + .withDescription("The max batch size of data upload to MaxCompute."); + + public static final ConfigOption BUCKET_BATCH_SIZE = + ConfigOptions.key("bucketBatchSize") + .stringType() + .defaultValue("4MB") + .withDescription( + "The max batch size of data per bucket when upload to MaxCompute"); + + public static final ConfigOption NUM_COMMIT_THREADS = + ConfigOptions.key("numCommitThreads") + .intType() + .defaultValue(16) + .withDescription("The number of threads used to commit data to MaxCompute."); + + public static final ConfigOption NUM_FLUSH_CONCURRENT = + ConfigOptions.key("numFlushConcurrent") + .intType() + .defaultValue(4) + .withDescription("The number of concurrent with flush bucket data."); +} diff --git a/flink-cdc-connect/flink-cdc-pipeline-connectors/flink-cdc-pipeline-connector-maxcompute/src/main/java/org/apache/flink/cdc/connectors/maxcompute/MaxComputeMetadataApplier.java b/flink-cdc-connect/flink-cdc-pipeline-connectors/flink-cdc-pipeline-connector-maxcompute/src/main/java/org/apache/flink/cdc/connectors/maxcompute/MaxComputeMetadataApplier.java new file mode 100644 index 0000000000..8eb6c1f384 --- /dev/null +++ b/flink-cdc-connect/flink-cdc-pipeline-connectors/flink-cdc-pipeline-connector-maxcompute/src/main/java/org/apache/flink/cdc/connectors/maxcompute/MaxComputeMetadataApplier.java @@ -0,0 +1,106 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.flink.cdc.connectors.maxcompute; + +import org.apache.flink.cdc.common.event.AddColumnEvent; +import org.apache.flink.cdc.common.event.AlterColumnTypeEvent; +import org.apache.flink.cdc.common.event.CreateTableEvent; +import org.apache.flink.cdc.common.event.DropColumnEvent; +import org.apache.flink.cdc.common.event.RenameColumnEvent; +import org.apache.flink.cdc.common.event.SchemaChangeEvent; +import org.apache.flink.cdc.common.sink.MetadataApplier; +import org.apache.flink.cdc.connectors.maxcompute.common.UncheckedOdpsException; +import org.apache.flink.cdc.connectors.maxcompute.options.MaxComputeOptions; +import org.apache.flink.cdc.connectors.maxcompute.utils.MaxComputeUtils; +import org.apache.flink.cdc.connectors.maxcompute.utils.SchemaEvolutionUtils; +import org.apache.flink.cdc.connectors.maxcompute.utils.TypeConvertUtils; + +import com.aliyun.odps.OdpsException; +import com.aliyun.odps.TableSchema; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** A {@link MetadataApplier} for "MaxCompute" connector. */ +public class MaxComputeMetadataApplier implements MetadataApplier { + private static final long serialVersionUID = 1L; + private static final Logger LOG = LoggerFactory.getLogger(MaxComputeMetadataApplier.class); + + private final MaxComputeOptions maxComputeOptions; + + public MaxComputeMetadataApplier(MaxComputeOptions maxComputeOptions) { + this.maxComputeOptions = maxComputeOptions; + } + + @Override + public void applySchemaChange(SchemaChangeEvent schemaChangeEvent) { + LOG.info("MaxCompute apply schema change event: {}", schemaChangeEvent); + try { + if (schemaChangeEvent instanceof CreateTableEvent) { + CreateTableEvent createTableEvent = (CreateTableEvent) schemaChangeEvent; + if (MaxComputeUtils.isTableExist(maxComputeOptions, createTableEvent.tableId())) { + TableSchema currentSchema = + MaxComputeUtils.getTableSchema( + maxComputeOptions, createTableEvent.tableId()); + TableSchema expectSchema = + TypeConvertUtils.toMaxCompute(createTableEvent.getSchema()); + if (!MaxComputeUtils.schemaEquals(currentSchema, expectSchema)) { + throw new IllegalStateException( + "The schema of create table event is not equals to exist table schema, please drop/rename exist table before flink cdc task start."); + } + } else { + SchemaEvolutionUtils.createTable( + maxComputeOptions, + createTableEvent.tableId(), + createTableEvent.getSchema()); + } + } else if (schemaChangeEvent instanceof AlterColumnTypeEvent) { + AlterColumnTypeEvent alterColumnTypeEvent = + (AlterColumnTypeEvent) schemaChangeEvent; + SchemaEvolutionUtils.alterColumnType( + maxComputeOptions, + alterColumnTypeEvent.tableId(), + alterColumnTypeEvent.getTypeMapping()); + } else if (schemaChangeEvent instanceof DropColumnEvent) { + DropColumnEvent dropColumnEvent = (DropColumnEvent) schemaChangeEvent; + SchemaEvolutionUtils.dropColumn( + maxComputeOptions, + dropColumnEvent.tableId(), + dropColumnEvent.getDroppedColumnNames()); + } else if (schemaChangeEvent instanceof RenameColumnEvent) { + RenameColumnEvent renameColumnEvent = (RenameColumnEvent) schemaChangeEvent; + SchemaEvolutionUtils.renameColumn( + maxComputeOptions, + renameColumnEvent.tableId(), + renameColumnEvent.getNameMapping()); + } else if (schemaChangeEvent instanceof AddColumnEvent) { + AddColumnEvent addColumnEvent = (AddColumnEvent) schemaChangeEvent; + SchemaEvolutionUtils.addColumns( + maxComputeOptions, + addColumnEvent.tableId(), + addColumnEvent.getAddedColumns()); + } else { + throw new 
UnsupportedOperationException( + "Unsupported schema change event: " + + schemaChangeEvent.getClass().getName()); + } + } catch (OdpsException e) { + throw new UncheckedOdpsException(e); + } + } +} diff --git a/flink-cdc-connect/flink-cdc-pipeline-connectors/flink-cdc-pipeline-connector-maxcompute/src/main/java/org/apache/flink/cdc/connectors/maxcompute/common/Constant.java b/flink-cdc-connect/flink-cdc-pipeline-connectors/flink-cdc-pipeline-connector-maxcompute/src/main/java/org/apache/flink/cdc/connectors/maxcompute/common/Constant.java new file mode 100644 index 0000000000..ed85024af7 --- /dev/null +++ b/flink-cdc-connect/flink-cdc-pipeline-connectors/flink-cdc-pipeline-connector-maxcompute/src/main/java/org/apache/flink/cdc/connectors/maxcompute/common/Constant.java @@ -0,0 +1,31 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.flink.cdc.connectors.maxcompute.common; + +/** Constant use for MaxCompute Connector. */ +public class Constant { + public static final String TUNNEL_SESSION_ID = "tunnel_session_id"; + public static final String MAXCOMPUTE_PARTITION_NAME = "maxcompute_partition_name"; + public static final String SCHEMA_ENABLE_FLAG = "odps.schema.model.enabled"; + + public static final String PIPELINE_SESSION_MANAGE_OPERATOR_UID = + "$$_session_manage_operator_$$"; + + public static final String END_OF_SESSION = "end_of_session"; +} diff --git a/flink-cdc-connect/flink-cdc-pipeline-connectors/flink-cdc-pipeline-connector-maxcompute/src/main/java/org/apache/flink/cdc/connectors/maxcompute/common/FlinkOdpsException.java b/flink-cdc-connect/flink-cdc-pipeline-connectors/flink-cdc-pipeline-connector-maxcompute/src/main/java/org/apache/flink/cdc/connectors/maxcompute/common/FlinkOdpsException.java new file mode 100755 index 0000000000..61e1041b83 --- /dev/null +++ b/flink-cdc-connect/flink-cdc-pipeline-connectors/flink-cdc-pipeline-connector-maxcompute/src/main/java/org/apache/flink/cdc/connectors/maxcompute/common/FlinkOdpsException.java @@ -0,0 +1,35 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.flink.cdc.connectors.maxcompute.common; + +/** Exception thrown by Flink MaxCompute Connector. */ +public class FlinkOdpsException extends RuntimeException { + private static final long serialVersionUID = 1L; + + public FlinkOdpsException(Throwable cause) { + super(cause); + } + + public FlinkOdpsException(String message) { + super(message); + } + + public FlinkOdpsException(String message, Throwable cause) { + super(message, cause); + } +} diff --git a/flink-cdc-connect/flink-cdc-pipeline-connectors/flink-cdc-pipeline-connector-maxcompute/src/main/java/org/apache/flink/cdc/connectors/maxcompute/common/SessionIdentifier.java b/flink-cdc-connect/flink-cdc-pipeline-connectors/flink-cdc-pipeline-connector-maxcompute/src/main/java/org/apache/flink/cdc/connectors/maxcompute/common/SessionIdentifier.java new file mode 100644 index 0000000000..4cdab849cc --- /dev/null +++ b/flink-cdc-connect/flink-cdc-pipeline-connectors/flink-cdc-pipeline-connector-maxcompute/src/main/java/org/apache/flink/cdc/connectors/maxcompute/common/SessionIdentifier.java @@ -0,0 +1,124 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.flink.cdc.connectors.maxcompute.common; + +import org.apache.flink.cdc.common.event.TableId; + +import java.io.Serializable; +import java.util.Objects; + +/** + * A Session is uniquely identified through {@link TableId} and {@link String partitionName}. When + * the Session is successfully created, this class can also carry the sessionId. Note that sessionId + * does not participate in the comparison of hashcode and equals. + */ +public class SessionIdentifier implements Serializable { + private static final long serialVersionUID = 1L; + private final String project; + private final String schema; + private final String table; + private final String partitionName; + + /** sessionId not calculate in hashcode and equals. 
*/ + private String sessionId; + + public SessionIdentifier(String project, String schema, String table, String partitionName) { + this(project, schema, table, partitionName, null); + } + + public SessionIdentifier( + String project, String schema, String table, String partitionName, String sessionId) { + this.project = project; + this.schema = schema; + this.table = table; + this.partitionName = partitionName; + this.sessionId = sessionId; + } + + public static SessionIdentifier of( + String project, String schema, String table, String partitionName) { + return new SessionIdentifier(project, schema, table, partitionName); + } + + public static SessionIdentifier of( + String project, String schema, String table, String partitionName, String sessionId) { + return new SessionIdentifier(project, schema, table, partitionName, sessionId); + } + + public String getProject() { + return project; + } + + public String getSchema() { + return schema; + } + + public String getTable() { + return table; + } + + public String getPartitionName() { + return partitionName; + } + + public String getSessionId() { + return sessionId; + } + + @Override + public String toString() { + return "SessionIdentifier{" + + "project='" + + project + + '\'' + + ", schema='" + + schema + + '\'' + + ", table='" + + table + + '\'' + + ", partitionName='" + + partitionName + + '\'' + + ", sessionId='" + + sessionId + + '\'' + + '}'; + } + + @Override + public boolean equals(Object o) { + if (this == o) { + return true; + } + if (o == null || getClass() != o.getClass()) { + return false; + } + SessionIdentifier that = (SessionIdentifier) o; + return Objects.equals(project, that.project) + && Objects.equals(schema, that.schema) + && Objects.equals(table, that.table) + && Objects.equals(partitionName, that.partitionName); + } + + @Override + public int hashCode() { + return Objects.hash(project, schema, table, partitionName); + } +} diff --git a/flink-cdc-connect/flink-cdc-pipeline-connectors/flink-cdc-pipeline-connector-maxcompute/src/main/java/org/apache/flink/cdc/connectors/maxcompute/common/UncheckedOdpsException.java b/flink-cdc-connect/flink-cdc-pipeline-connectors/flink-cdc-pipeline-connector-maxcompute/src/main/java/org/apache/flink/cdc/connectors/maxcompute/common/UncheckedOdpsException.java new file mode 100644 index 0000000000..a4eedfbd9c --- /dev/null +++ b/flink-cdc-connect/flink-cdc-pipeline-connectors/flink-cdc-pipeline-connector-maxcompute/src/main/java/org/apache/flink/cdc/connectors/maxcompute/common/UncheckedOdpsException.java @@ -0,0 +1,42 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.flink.cdc.connectors.maxcompute.common; + +import com.aliyun.odps.OdpsException; + +/** a wrapper class for {@link OdpsException} which throws {@link RuntimeException}. */ +public class UncheckedOdpsException extends RuntimeException { + private static final long serialVersionUID = 1L; + private final OdpsException cause; + + public UncheckedOdpsException(OdpsException cause) { + super(cause); + this.cause = cause; + } + + @Override + public OdpsException getCause() { + return cause; + } + + @Override + public String getMessage() { + return cause.getMessage() + ", requestId: " + cause.getRequestId(); + } +} diff --git a/flink-cdc-connect/flink-cdc-pipeline-connectors/flink-cdc-pipeline-connector-maxcompute/src/main/java/org/apache/flink/cdc/connectors/maxcompute/coordinator/SessionManageCoordinatedOperatorFactory.java b/flink-cdc-connect/flink-cdc-pipeline-connectors/flink-cdc-pipeline-connector-maxcompute/src/main/java/org/apache/flink/cdc/connectors/maxcompute/coordinator/SessionManageCoordinatedOperatorFactory.java new file mode 100644 index 0000000000..870983bdcf --- /dev/null +++ b/flink-cdc-connect/flink-cdc-pipeline-connectors/flink-cdc-pipeline-connector-maxcompute/src/main/java/org/apache/flink/cdc/connectors/maxcompute/coordinator/SessionManageCoordinatedOperatorFactory.java @@ -0,0 +1,83 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.flink.cdc.connectors.maxcompute.coordinator; + +import org.apache.flink.cdc.common.event.Event; +import org.apache.flink.cdc.composer.flink.coordination.OperatorIDGenerator; +import org.apache.flink.cdc.connectors.maxcompute.options.MaxComputeOptions; +import org.apache.flink.cdc.connectors.maxcompute.options.MaxComputeWriteOptions; +import org.apache.flink.runtime.jobgraph.OperatorID; +import org.apache.flink.runtime.jobgraph.tasks.TaskOperatorEventGateway; +import org.apache.flink.runtime.operators.coordination.OperatorCoordinator.Provider; +import org.apache.flink.streaming.api.operators.AbstractStreamOperatorFactory; +import org.apache.flink.streaming.api.operators.CoordinatedOperatorFactory; +import org.apache.flink.streaming.api.operators.OneInputStreamOperatorFactory; +import org.apache.flink.streaming.api.operators.StreamOperator; +import org.apache.flink.streaming.api.operators.StreamOperatorParameters; + +/** The {@link AbstractStreamOperatorFactory} for {@link SessionManageOperator}. 
*/ +public class SessionManageCoordinatedOperatorFactory extends AbstractStreamOperatorFactory + implements CoordinatedOperatorFactory, OneInputStreamOperatorFactory { + private static final long serialVersionUID = 1L; + private final MaxComputeOptions options; + private final MaxComputeWriteOptions writeOptions; + private final String schemaOperatorUid; + + public SessionManageCoordinatedOperatorFactory( + MaxComputeOptions options, + MaxComputeWriteOptions writeOptions, + String schemaOperatorUid) { + this.options = options; + this.writeOptions = writeOptions; + this.schemaOperatorUid = schemaOperatorUid; + } + + @Override + public > T createStreamOperator( + StreamOperatorParameters parameters) { + OperatorIDGenerator schemaOperatorIdGenerator = new OperatorIDGenerator(schemaOperatorUid); + SessionManageOperator operator = + new SessionManageOperator(options, schemaOperatorIdGenerator.generate()); + TaskOperatorEventGateway taskOperatorEventGateway = + parameters + .getContainingTask() + .getEnvironment() + .getOperatorCoordinatorEventGateway(); + operator.setup( + parameters.getContainingTask(), + parameters.getStreamConfig(), + parameters.getOutput()); + operator.setTaskOperatorEventGateway(taskOperatorEventGateway); + parameters + .getOperatorEventDispatcher() + .registerEventHandler(operator.getOperatorID(), operator); + return (T) operator; + } + + @Override + public Class getStreamOperatorClass(ClassLoader classLoader) { + return SessionManageOperator.class; + } + + @Override + public Provider getCoordinatorProvider(String operatorName, OperatorID operatorID) { + return new SessionManageCoordinator.SessionManageCoordinatorProvider( + operatorName, operatorID, options, writeOptions); + } +} diff --git a/flink-cdc-connect/flink-cdc-pipeline-connectors/flink-cdc-pipeline-connector-maxcompute/src/main/java/org/apache/flink/cdc/connectors/maxcompute/coordinator/SessionManageCoordinator.java b/flink-cdc-connect/flink-cdc-pipeline-connectors/flink-cdc-pipeline-connector-maxcompute/src/main/java/org/apache/flink/cdc/connectors/maxcompute/coordinator/SessionManageCoordinator.java new file mode 100644 index 0000000000..cd62ac5096 --- /dev/null +++ b/flink-cdc-connect/flink-cdc-pipeline-connectors/flink-cdc-pipeline-connector-maxcompute/src/main/java/org/apache/flink/cdc/connectors/maxcompute/coordinator/SessionManageCoordinator.java @@ -0,0 +1,302 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.flink.cdc.connectors.maxcompute.coordinator; + +import org.apache.flink.cdc.common.utils.StringUtils; +import org.apache.flink.cdc.connectors.maxcompute.common.Constant; +import org.apache.flink.cdc.connectors.maxcompute.common.FlinkOdpsException; +import org.apache.flink.cdc.connectors.maxcompute.common.SessionIdentifier; +import org.apache.flink.cdc.connectors.maxcompute.coordinator.message.CommitSessionRequest; +import org.apache.flink.cdc.connectors.maxcompute.coordinator.message.CreateSessionRequest; +import org.apache.flink.cdc.connectors.maxcompute.coordinator.message.CreateSessionResponse; +import org.apache.flink.cdc.connectors.maxcompute.coordinator.message.WaitForFlushSuccessRequest; +import org.apache.flink.cdc.connectors.maxcompute.options.MaxComputeOptions; +import org.apache.flink.cdc.connectors.maxcompute.options.MaxComputeWriteOptions; +import org.apache.flink.cdc.connectors.maxcompute.utils.MaxComputeUtils; +import org.apache.flink.cdc.connectors.maxcompute.utils.RetryUtils; +import org.apache.flink.cdc.connectors.maxcompute.utils.SessionCommitCoordinateHelper; +import org.apache.flink.cdc.connectors.maxcompute.writer.MaxComputeWriter; +import org.apache.flink.cdc.runtime.operators.schema.event.CoordinationResponseUtils; +import org.apache.flink.runtime.jobgraph.OperatorID; +import org.apache.flink.runtime.operators.coordination.CoordinationRequest; +import org.apache.flink.runtime.operators.coordination.CoordinationRequestHandler; +import org.apache.flink.runtime.operators.coordination.CoordinationResponse; +import org.apache.flink.runtime.operators.coordination.OperatorCoordinator; +import org.apache.flink.runtime.operators.coordination.OperatorEvent; +import org.apache.flink.util.ExceptionUtils; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import javax.annotation.Nullable; + +import java.io.IOException; +import java.io.UncheckedIOException; +import java.util.Arrays; +import java.util.HashMap; +import java.util.Map; +import java.util.concurrent.CompletableFuture; +import java.util.concurrent.CompletionException; +import java.util.concurrent.ExecutorService; +import java.util.concurrent.Executors; +import java.util.concurrent.Future; +import java.util.concurrent.atomic.AtomicBoolean; + +/** + * An OperatorCoordinator is used to manage the Session and is consistent with accepting {@link + * CreateSessionRequest} and {@link CommitSessionRequest} sent by the Operator. 
+ */ +public class SessionManageCoordinator implements OperatorCoordinator, CoordinationRequestHandler { + + private static final Logger LOG = LoggerFactory.getLogger(SessionManageCoordinator.class); + private final String operatorName; + private final MaxComputeOptions options; + private final MaxComputeWriteOptions writeOptions; + private final int parallelism; + private SessionCommitCoordinateHelper sessionCommitCoordinateHelper; + private Map sessionCache; + private Map sessionIdMap; + private CompletableFuture[] waitingFlushFutures; + private ExecutorService executor; + + private SessionManageCoordinator( + String operatorName, + Context context, + MaxComputeOptions options, + MaxComputeWriteOptions writeOptions) { + this.operatorName = operatorName; + this.parallelism = context.currentParallelism(); + this.options = options; + this.writeOptions = writeOptions; + } + + @Override + public void start() { + LOG.info("Starting SessionManageCoordinator {}.", operatorName); + + this.sessionCache = new HashMap<>(); + this.sessionIdMap = new HashMap<>(); + // start the executor + this.executor = Executors.newFixedThreadPool(writeOptions.getNumCommitThread()); + + this.waitingFlushFutures = new CompletableFuture[parallelism]; + this.sessionCommitCoordinateHelper = new SessionCommitCoordinateHelper(parallelism); + } + + @Override + public void close() throws Exception { + if (this.executor != null) { + this.executor.shutdown(); + } + } + + @Override + public void handleEventFromOperator(int subtask, int attemptNumber, OperatorEvent event) { + // nothing to do + } + + private MaxComputeWriter createWriter(SessionIdentifier identifier) { + String partitionName = identifier.getPartitionName(); + if (!StringUtils.isNullOrWhitespaceOnly(partitionName)) { + RetryUtils.executeUnchecked( + () -> { + MaxComputeUtils.createPartitionIfAbsent( + options, + identifier.getSchema(), + identifier.getTable(), + partitionName); + return null; + }); + } + try { + MaxComputeWriter writer = + MaxComputeWriter.batchWriter(options, writeOptions, identifier); + LOG.info("Create session for table {}, sessionId {}.", identifier, writer.getId()); + return writer; + } catch (IOException e) { + throw new UncheckedIOException(e); + } + } + + @Override + public void checkpointCoordinator(long checkpointId, CompletableFuture result) { + executor.execute( + () -> { + try { + result.complete(new byte[0]); + } catch (Throwable throwable) { + ExceptionUtils.rethrowIfFatalErrorOrOOM(throwable); + // when a checkpoint fails, throws directly. 
+ result.completeExceptionally( + new CompletionException( + String.format( + "Failed to checkpoint Session %s for source %s", + checkpointId, this.getClass().getSimpleName()), + throwable)); + } + }); + } + + @Override + public CompletableFuture handleCoordinationRequest( + CoordinationRequest request) { + LOG.info("Received coordination request {}.", request); + if (request instanceof CommitSessionRequest) { + CommitSessionRequest commitSessionRequest = (CommitSessionRequest) request; + + CompletableFuture future = + sessionCommitCoordinateHelper.commit( + commitSessionRequest.getOperatorIndex(), + commitSessionRequest.getSessionId()); + String toSubmitSessionId = sessionCommitCoordinateHelper.getToCommitSessionId(); + while (sessionCommitCoordinateHelper.isCommitting() && toSubmitSessionId != null) { + commitSession(toSubmitSessionId); + toSubmitSessionId = sessionCommitCoordinateHelper.getToCommitSessionId(); + } + if (!sessionCommitCoordinateHelper.isCommitting()) { + sessionCommitCoordinateHelper.commitSuccess(Constant.END_OF_SESSION, true); + sessionCommitCoordinateHelper.clear(); + + if (!sessionCache.isEmpty()) { + throw new FlinkOdpsException( + "sessionCache not empty: " + sessionCache.keySet()); + } + completeAllFlushFutures(); + } + return future; + } else if (request instanceof WaitForFlushSuccessRequest) { + CompletableFuture waitingFlushFuture = new CompletableFuture<>(); + waitingFlushFutures[((WaitForFlushSuccessRequest) request).getOperatorIndex()] = + waitingFlushFuture; + return waitingFlushFuture; + } else if (request instanceof CreateSessionRequest) { + SessionIdentifier sessionIdentifier = ((CreateSessionRequest) request).getIdentifier(); + if (!sessionCache.containsKey(sessionIdentifier)) { + MaxComputeWriter writer = createWriter(sessionIdentifier); + sessionCache.put(sessionIdentifier, writer); + sessionIdMap.put(writer.getId(), sessionIdentifier); + } + return CompletableFuture.completedFuture( + CoordinationResponseUtils.wrap( + new CreateSessionResponse( + sessionCache.get(sessionIdentifier).getId()))); + } else { + return CompletableFuture.completedFuture(null); + } + } + + private void commitSession(String toSubmitSessionId) { + MaxComputeWriter writer = sessionCache.remove(sessionIdMap.remove(toSubmitSessionId)); + AtomicBoolean isSuccess = new AtomicBoolean(true); + LOG.info("start commit writer {}.", toSubmitSessionId); + try { + Future future = + executor.submit( + () -> { + try { + writer.commit(); + } catch (Throwable throwable) { + ExceptionUtils.rethrowIfFatalErrorOrOOM(throwable); + LOG.warn( + "Failed to commit writer {}.", + writer.getId(), + throwable); + isSuccess.set(false); + } + }); + future.get(); + } catch (Exception e) { + isSuccess.set(false); + } + sessionCommitCoordinateHelper.commitSuccess(toSubmitSessionId, isSuccess.get()); + } + + private void completeAllFlushFutures() { + for (CompletableFuture waitingFlushFuture : waitingFlushFutures) { + waitingFlushFuture.complete(null); + } + Arrays.fill(waitingFlushFutures, null); + } + + @Override + public void notifyCheckpointComplete(long checkpointId) { + // nothing to do + } + + @Override + public void resetToCheckpoint(long checkpointId, @Nullable byte[] checkpointData) + throws Exception { + // nothing to do + } + + @Override + public void subtaskReset(int subtask, long checkpointId) { + // nothing to do + } + + @Override + public void executionAttemptFailed(int subtask, int attemptNumber, @Nullable Throwable reason) { + // nothing to do + } + + @Override + public void 
executionAttemptReady(int subtask, int attemptNumber, SubtaskGateway gateway) { + // nothing to do + } + + /** + * The {@link org.apache.flink.runtime.operators.coordination.OperatorCoordinator.Provider} of + * {@link SessionManageCoordinator}. + */ + public static class SessionManageCoordinatorProvider implements Provider { + + private final OperatorID operatorID; + private final String operatorName; + private final MaxComputeOptions options; + private final MaxComputeWriteOptions writeOptions; + + public SessionManageCoordinatorProvider( + String operatorName, + OperatorID operatorID, + MaxComputeOptions options, + MaxComputeWriteOptions writeOptions) { + this.operatorName = operatorName; + this.operatorID = operatorID; + + this.options = options; + this.writeOptions = writeOptions; + } + + /** Gets the ID of the operator to which the coordinator belongs. */ + @Override + public OperatorID getOperatorId() { + return operatorID; + } + + /** + * Creates the {@code OperatorCoordinator}, using the given context. + * + * @param context + */ + @Override + public OperatorCoordinator create(Context context) throws Exception { + return new SessionManageCoordinator(operatorName, context, options, writeOptions); + } + } +} diff --git a/flink-cdc-connect/flink-cdc-pipeline-connectors/flink-cdc-pipeline-connector-maxcompute/src/main/java/org/apache/flink/cdc/connectors/maxcompute/coordinator/SessionManageOperator.java b/flink-cdc-connect/flink-cdc-pipeline-connectors/flink-cdc-pipeline-connector-maxcompute/src/main/java/org/apache/flink/cdc/connectors/maxcompute/coordinator/SessionManageOperator.java new file mode 100644 index 0000000000..be481d93ef --- /dev/null +++ b/flink-cdc-connect/flink-cdc-pipeline-connectors/flink-cdc-pipeline-connector-maxcompute/src/main/java/org/apache/flink/cdc/connectors/maxcompute/coordinator/SessionManageOperator.java @@ -0,0 +1,297 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.flink.cdc.connectors.maxcompute.coordinator; + +import org.apache.flink.cdc.common.data.RecordData; +import org.apache.flink.cdc.common.event.CreateTableEvent; +import org.apache.flink.cdc.common.event.DataChangeEvent; +import org.apache.flink.cdc.common.event.Event; +import org.apache.flink.cdc.common.event.FlushEvent; +import org.apache.flink.cdc.common.event.OperationType; +import org.apache.flink.cdc.common.event.SchemaChangeEvent; +import org.apache.flink.cdc.common.event.TableId; +import org.apache.flink.cdc.common.schema.Schema; +import org.apache.flink.cdc.common.utils.SchemaUtils; +import org.apache.flink.cdc.connectors.maxcompute.common.Constant; +import org.apache.flink.cdc.connectors.maxcompute.common.SessionIdentifier; +import org.apache.flink.cdc.connectors.maxcompute.coordinator.message.CreateSessionRequest; +import org.apache.flink.cdc.connectors.maxcompute.coordinator.message.CreateSessionResponse; +import org.apache.flink.cdc.connectors.maxcompute.coordinator.message.WaitForFlushSuccessRequest; +import org.apache.flink.cdc.connectors.maxcompute.options.MaxComputeOptions; +import org.apache.flink.cdc.connectors.maxcompute.utils.MaxComputeUtils; +import org.apache.flink.cdc.connectors.maxcompute.utils.TypeConvertUtils; +import org.apache.flink.cdc.runtime.operators.schema.event.CoordinationResponseUtils; +import org.apache.flink.cdc.runtime.operators.sink.SchemaEvolutionClient; +import org.apache.flink.runtime.jobgraph.OperatorID; +import org.apache.flink.runtime.jobgraph.tasks.TaskOperatorEventGateway; +import org.apache.flink.runtime.operators.coordination.CoordinationRequest; +import org.apache.flink.runtime.operators.coordination.CoordinationResponse; +import org.apache.flink.runtime.operators.coordination.OperatorEvent; +import org.apache.flink.runtime.operators.coordination.OperatorEventHandler; +import org.apache.flink.runtime.state.StateSnapshotContext; +import org.apache.flink.streaming.api.graph.StreamConfig; +import org.apache.flink.streaming.api.operators.AbstractStreamOperator; +import org.apache.flink.streaming.api.operators.BoundedOneInput; +import org.apache.flink.streaming.api.operators.ChainingStrategy; +import org.apache.flink.streaming.api.operators.OneInputStreamOperator; +import org.apache.flink.streaming.api.operators.Output; +import org.apache.flink.streaming.runtime.streamrecord.StreamRecord; +import org.apache.flink.streaming.runtime.tasks.StreamTask; +import org.apache.flink.util.SerializedValue; + +import com.aliyun.odps.PartitionSpec; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.IOException; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.Objects; +import java.util.Optional; +import java.util.concurrent.CompletableFuture; +import java.util.concurrent.ExecutionException; +import java.util.concurrent.Future; + +/** + * Processes a {@link DataChangeEvent}, extracting data and encapsulating it into a {@link + * SessionIdentifier}, and then sends a {@link CreateSessionRequest} to the {@link + * SessionManageCoordinator} to create a writing session. Subsequently, it incorporates the + * SessionId into the metadata of the {@link DataChangeEvent} for downstream processing. 
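+ *
+ * <p>In simplified form, the per-event flow implemented in {@code processElement} is roughly:
+ *
+ * <pre>{@code
+ * SessionIdentifier sessionIdentifier =
+ *         SessionIdentifier.of(
+ *                 options.getProject(),
+ *                 MaxComputeUtils.getSchema(options, tableId),
+ *                 tableId.getTableName(),
+ *                 partitionName);
+ * if (!sessionCache.containsKey(sessionIdentifier)) {
+ *     CreateSessionResponse response =
+ *             (CreateSessionResponse)
+ *                     sendRequestToOperator(new CreateSessionRequest(sessionIdentifier));
+ *     sessionCache.put(sessionIdentifier, response.getSessionId());
+ * }
+ * dataChangeEvent.meta().put(Constant.TUNNEL_SESSION_ID, sessionCache.get(sessionIdentifier));
+ * }</pre>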
+ */ +public class SessionManageOperator extends AbstractStreamOperator + implements OneInputStreamOperator, OperatorEventHandler, BoundedOneInput { + private static final long serialVersionUID = 1L; + private static final Logger LOG = LoggerFactory.getLogger(SessionManageOperator.class); + + /** a tricky way to get an Operator from sink. */ + public static SessionManageOperator instance; + + private final MaxComputeOptions options; + private final OperatorID schemaOperatorUid; + + private transient TaskOperatorEventGateway taskOperatorEventGateway; + private transient Map sessionCache; + private transient Map schemaMaps; + private transient Map> fieldGetterMaps; + private transient SchemaEvolutionClient schemaEvolutionClient; + + private transient Future snapshotFlushSuccess; + private transient int indexOfThisSubtask; + /** + * trigger endOfInput is ahead of prepareSnapshotPreBarrier, so we need this flag to handle when + * endOfInput, send WaitForSuccessRequest in advance. + */ + private transient boolean endOfInput; + + public SessionManageOperator(MaxComputeOptions options, OperatorID schemaOperatorUid) { + this.chainingStrategy = ChainingStrategy.ALWAYS; + this.options = options; + this.schemaOperatorUid = schemaOperatorUid; + } + + @Override + public void open() throws Exception { + this.sessionCache = new HashMap<>(); + this.schemaMaps = new HashMap<>(); + this.fieldGetterMaps = new HashMap<>(); + SessionManageOperator.instance = this; + } + + @Override + public void setup( + StreamTask containingTask, + StreamConfig config, + Output> output) { + super.setup(containingTask, config, output); + schemaEvolutionClient = + new SchemaEvolutionClient( + containingTask.getEnvironment().getOperatorCoordinatorEventGateway(), + schemaOperatorUid); + indexOfThisSubtask = getRuntimeContext().getIndexOfThisSubtask(); + } + + @Override + public void processElement(StreamRecord element) throws Exception { + if (element.getValue() instanceof DataChangeEvent) { + DataChangeEvent dataChangeEvent = (DataChangeEvent) element.getValue(); + TableId tableId = dataChangeEvent.tableId(); + // because of this operator is between SchemaOperator and DataSinkWriterOperator, no + // schema will fill when CreateTableEvent is loss. + if (!schemaMaps.containsKey(tableId)) { + emitLatestSchema(tableId); + } + String partitionName = + extractPartition( + dataChangeEvent.op() == OperationType.DELETE + ? 
dataChangeEvent.before() + : dataChangeEvent.after(), + tableId); + SessionIdentifier sessionIdentifier = + SessionIdentifier.of( + options.getProject(), + MaxComputeUtils.getSchema(options, tableId), + tableId.getTableName(), + partitionName); + if (!sessionCache.containsKey(sessionIdentifier)) { + CreateSessionResponse response = + (CreateSessionResponse) + sendRequestToOperator(new CreateSessionRequest(sessionIdentifier)); + sessionCache.put(sessionIdentifier, response.getSessionId()); + } + dataChangeEvent + .meta() + .put(Constant.TUNNEL_SESSION_ID, sessionCache.get(sessionIdentifier)); + dataChangeEvent.meta().put(Constant.MAXCOMPUTE_PARTITION_NAME, partitionName); + output.collect(new StreamRecord<>(dataChangeEvent)); + } else if (element.getValue() instanceof FlushEvent) { + LOG.info( + "operator {} handle FlushEvent begin, wait for sink writers flush success", + indexOfThisSubtask); + sessionCache.clear(); + Future waitForSuccess = + submitRequestToOperator(new WaitForFlushSuccessRequest(indexOfThisSubtask)); + output.collect(element); + // wait for sink writers flush success + waitForSuccess.get(); + LOG.info( + "operator {} handle FlushEvent end, all sink writers flush success", + indexOfThisSubtask); + } else if (element.getValue() instanceof CreateTableEvent) { + TableId tableId = ((CreateTableEvent) element.getValue()).tableId(); + Schema schema = ((CreateTableEvent) element.getValue()).getSchema(); + schemaMaps.put(tableId, schema); + fieldGetterMaps.put(tableId, TypeConvertUtils.createFieldGetters(schema)); + output.collect(element); + } else if (element.getValue() instanceof SchemaChangeEvent) { + SchemaChangeEvent schemaChangeEvent = (SchemaChangeEvent) element.getValue(); + TableId tableId = schemaChangeEvent.tableId(); + Schema newSchema = + SchemaUtils.applySchemaChangeEvent(schemaMaps.get(tableId), schemaChangeEvent); + schemaMaps.put(tableId, newSchema); + fieldGetterMaps.put(tableId, TypeConvertUtils.createFieldGetters(newSchema)); + output.collect(element); + } else { + output.collect(element); + LOG.warn("unknown element {}", element.getValue()); + } + } + + private void emitLatestSchema(TableId tableId) throws Exception { + Optional schema = schemaEvolutionClient.getLatestSchema(tableId); + if (schema.isPresent()) { + Schema latestSchema = schema.get(); + schemaMaps.put(tableId, latestSchema); + fieldGetterMaps.put(tableId, TypeConvertUtils.createFieldGetters(latestSchema)); + output.collect(new StreamRecord<>(new CreateTableEvent(tableId, latestSchema))); + } else { + throw new RuntimeException( + "Could not find schema message from SchemaRegistry for " + tableId); + } + } + + @Override + public void prepareSnapshotPreBarrier(long checkpointId) throws Exception { + super.prepareSnapshotPreBarrier(checkpointId); + if (endOfInput) { + return; + } + LOG.info( + "operator {} prepare snapshot, wait for sink writers flush success", + indexOfThisSubtask); + // wait for sink writers flush success + waitLastSnapshotFlushSuccess(); + snapshotFlushSuccess = + submitRequestToOperator( + new WaitForFlushSuccessRequest( + getRuntimeContext().getIndexOfThisSubtask())); + } + + @Override + public void snapshotState(StateSnapshotContext context) throws Exception { + super.snapshotState(context); + sessionCache.clear(); + waitLastSnapshotFlushSuccess(); + LOG.info("operator {} snapshot end, all sink writers flush success", indexOfThisSubtask); + } + + @Override + public void endInput() throws Exception { + this.endOfInput = true; + LOG.info( + "operator {} end of input, wait for 
sink writers flush success", + indexOfThisSubtask); + waitLastSnapshotFlushSuccess(); + snapshotFlushSuccess = + submitRequestToOperator( + new WaitForFlushSuccessRequest( + getRuntimeContext().getIndexOfThisSubtask())); + } + + private void waitLastSnapshotFlushSuccess() throws Exception { + if (snapshotFlushSuccess != null) { + snapshotFlushSuccess.get(); + snapshotFlushSuccess = null; + } + } + + /** partition column is always after data column. */ + private String extractPartition(RecordData recordData, TableId tableId) { + Schema schema = schemaMaps.get(tableId); + int partitionKeyCount = schema.partitionKeys().size(); + if (partitionKeyCount == 0) { + return null; + } + int columnCount = schema.getColumnCount(); + List fieldGetters = fieldGetterMaps.get(tableId); + + PartitionSpec partitionSpec = new PartitionSpec(); + for (int i = 0; i < partitionKeyCount; i++) { + RecordData.FieldGetter fieldGetter = + fieldGetters.get(columnCount - partitionKeyCount - 1 + i); + Object value = fieldGetter.getFieldOrNull(recordData); + partitionSpec.set(schema.partitionKeys().get(i), Objects.toString(value)); + } + return partitionSpec.toString(true, true); + } + + @Override + public void handleOperatorEvent(OperatorEvent evt) { + // handle event + } + + /** call from CreateSessionCoordinatedOperatorFactory. */ + public void setTaskOperatorEventGateway(TaskOperatorEventGateway taskOperatorEventGateway) { + this.taskOperatorEventGateway = taskOperatorEventGateway; + } + + public CoordinationResponse sendRequestToOperator(CoordinationRequest request) + throws IOException, ExecutionException, InterruptedException { + CompletableFuture responseFuture = + taskOperatorEventGateway.sendRequestToCoordinator( + getOperatorID(), new SerializedValue<>(request)); + return CoordinationResponseUtils.unwrap(responseFuture.get()); + } + + public Future submitRequestToOperator(CoordinationRequest request) + throws IOException { + return taskOperatorEventGateway.sendRequestToCoordinator( + getOperatorID(), new SerializedValue<>(request)); + } +} diff --git a/flink-cdc-connect/flink-cdc-pipeline-connectors/flink-cdc-pipeline-connector-maxcompute/src/main/java/org/apache/flink/cdc/connectors/maxcompute/coordinator/message/CommitSessionRequest.java b/flink-cdc-connect/flink-cdc-pipeline-connectors/flink-cdc-pipeline-connector-maxcompute/src/main/java/org/apache/flink/cdc/connectors/maxcompute/coordinator/message/CommitSessionRequest.java new file mode 100644 index 0000000000..c12ed2049e --- /dev/null +++ b/flink-cdc-connect/flink-cdc-pipeline-connectors/flink-cdc-pipeline-connector-maxcompute/src/main/java/org/apache/flink/cdc/connectors/maxcompute/coordinator/message/CommitSessionRequest.java @@ -0,0 +1,58 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.flink.cdc.connectors.maxcompute.coordinator.message; + +import org.apache.flink.runtime.operators.coordination.CoordinationRequest; + +/** + * The commit session request from {@link + * org.apache.flink.cdc.connectors.maxcompute.sink.MaxComputeEventWriter} to {@link + * org.apache.flink.cdc.connectors.maxcompute.coordinator.SessionManageCoordinator}. Which is a type + * of {@link SyncRequest}. + */ +public class CommitSessionRequest implements CoordinationRequest { + private static final long serialVersionUID = 1L; + + private final int operatorIndex; + private final String sessionId; + + public CommitSessionRequest(int operatorIndex, String sessionId) { + this.operatorIndex = operatorIndex; + this.sessionId = sessionId; + } + + public int getOperatorIndex() { + return operatorIndex; + } + + public String getSessionId() { + return sessionId; + } + + @Override + public String toString() { + return "CommitSessionRequest{" + + "operatorIndex=" + + operatorIndex + + ", sessionId='" + + sessionId + + '\'' + + '}'; + } +} diff --git a/flink-cdc-connect/flink-cdc-pipeline-connectors/flink-cdc-pipeline-connector-maxcompute/src/main/java/org/apache/flink/cdc/connectors/maxcompute/coordinator/message/CommitSessionResponse.java b/flink-cdc-connect/flink-cdc-pipeline-connectors/flink-cdc-pipeline-connector-maxcompute/src/main/java/org/apache/flink/cdc/connectors/maxcompute/coordinator/message/CommitSessionResponse.java new file mode 100644 index 0000000000..105dca3bc7 --- /dev/null +++ b/flink-cdc-connect/flink-cdc-pipeline-connectors/flink-cdc-pipeline-connector-maxcompute/src/main/java/org/apache/flink/cdc/connectors/maxcompute/coordinator/message/CommitSessionResponse.java @@ -0,0 +1,45 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.flink.cdc.connectors.maxcompute.coordinator.message; + +import org.apache.flink.runtime.operators.coordination.CoordinationResponse; + +/** + * Response for a {@link CommitSessionRequest}. This response is sent from {@link + * org.apache.flink.cdc.connectors.maxcompute.coordinator.SessionManageCoordinator} to {@link + * org.apache.flink.cdc.connectors.maxcompute.sink.MaxComputeEventWriter}. + * + *

A successful response indicates that all sessions have been committed, allowing the writer to + * proceed to the next round of writing. Otherwise, if any session has not been successfully + * committed, all task managers are instructed to reset to the latest checkpoint in order to retry + * the operation. + */ +public class CommitSessionResponse implements CoordinationResponse { + private static final long serialVersionUID = 1L; + + private final boolean success; + + public CommitSessionResponse(boolean success) { + this.success = success; + } + + public boolean isSuccess() { + return success; + } +} diff --git a/flink-cdc-connect/flink-cdc-pipeline-connectors/flink-cdc-pipeline-connector-maxcompute/src/main/java/org/apache/flink/cdc/connectors/maxcompute/coordinator/message/CreateSessionRequest.java b/flink-cdc-connect/flink-cdc-pipeline-connectors/flink-cdc-pipeline-connector-maxcompute/src/main/java/org/apache/flink/cdc/connectors/maxcompute/coordinator/message/CreateSessionRequest.java new file mode 100644 index 0000000000..fe8dac2972 --- /dev/null +++ b/flink-cdc-connect/flink-cdc-pipeline-connectors/flink-cdc-pipeline-connector-maxcompute/src/main/java/org/apache/flink/cdc/connectors/maxcompute/coordinator/message/CreateSessionRequest.java @@ -0,0 +1,50 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.flink.cdc.connectors.maxcompute.coordinator.message; + +import org.apache.flink.cdc.connectors.maxcompute.common.SessionIdentifier; +import org.apache.flink.runtime.operators.coordination.CoordinationRequest; + +/** + * Represents a request sent from a {@link + * org.apache.flink.cdc.connectors.maxcompute.coordinator.SessionManageOperator} to the {@link + * org.apache.flink.cdc.connectors.maxcompute.coordinator.SessionManageCoordinator}. + * + *

When a {@link org.apache.flink.cdc.common.event.DataChangeEvent} indicates a new session, a + * {@link CreateSessionRequest} is sent to the coordinator to handle the session creation process. + * + *

use {@link SessionIdentifier} to identify the session, + */ +public class CreateSessionRequest implements CoordinationRequest { + private static final long serialVersionUID = 1L; + + private SessionIdentifier identifier; + + public CreateSessionRequest(SessionIdentifier identifier) { + this.identifier = identifier; + } + + public SessionIdentifier getIdentifier() { + return identifier; + } + + public void setIdentifier(SessionIdentifier identifier) { + this.identifier = identifier; + } +} diff --git a/flink-cdc-connect/flink-cdc-pipeline-connectors/flink-cdc-pipeline-connector-maxcompute/src/main/java/org/apache/flink/cdc/connectors/maxcompute/coordinator/message/CreateSessionResponse.java b/flink-cdc-connect/flink-cdc-pipeline-connectors/flink-cdc-pipeline-connector-maxcompute/src/main/java/org/apache/flink/cdc/connectors/maxcompute/coordinator/message/CreateSessionResponse.java new file mode 100644 index 0000000000..c650d0240a --- /dev/null +++ b/flink-cdc-connect/flink-cdc-pipeline-connectors/flink-cdc-pipeline-connector-maxcompute/src/main/java/org/apache/flink/cdc/connectors/maxcompute/coordinator/message/CreateSessionResponse.java @@ -0,0 +1,39 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.flink.cdc.connectors.maxcompute.coordinator.message; + +import org.apache.flink.cdc.connectors.maxcompute.common.SessionIdentifier; +import org.apache.flink.runtime.operators.coordination.CoordinationResponse; + +/** + * the response of {@link CreateSessionRequest}, which contains the sessionId of specific {@link + * SessionIdentifier}. + */ +public class CreateSessionResponse implements CoordinationResponse { + + private final String sessionId; + + public CreateSessionResponse(String sessionId) { + this.sessionId = sessionId; + } + + public String getSessionId() { + return sessionId; + } +} diff --git a/flink-cdc-connect/flink-cdc-pipeline-connectors/flink-cdc-pipeline-connector-maxcompute/src/main/java/org/apache/flink/cdc/connectors/maxcompute/coordinator/message/SyncRequest.java b/flink-cdc-connect/flink-cdc-pipeline-connectors/flink-cdc-pipeline-connector-maxcompute/src/main/java/org/apache/flink/cdc/connectors/maxcompute/coordinator/message/SyncRequest.java new file mode 100644 index 0000000000..02ba9f915c --- /dev/null +++ b/flink-cdc-connect/flink-cdc-pipeline-connectors/flink-cdc-pipeline-connector-maxcompute/src/main/java/org/apache/flink/cdc/connectors/maxcompute/coordinator/message/SyncRequest.java @@ -0,0 +1,41 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.flink.cdc.connectors.maxcompute.coordinator.message; + +import org.apache.flink.runtime.operators.coordination.CoordinationRequest; + +/** + * A request signaling the {@link + * org.apache.flink.cdc.connectors.maxcompute.coordinator.SessionManageCoordinator} to await + * requests from all operators. Upon receiving this request from every operator, the coordinator + * proceeds to send a response. + */ +public class SyncRequest implements CoordinationRequest { + private static final long serialVersionUID = 1L; + + private final int operatorIndex; + + public SyncRequest(int operatorIndex) { + this.operatorIndex = operatorIndex; + } + + public int getOperatorIndex() { + return operatorIndex; + } +} diff --git a/flink-cdc-connect/flink-cdc-pipeline-connectors/flink-cdc-pipeline-connector-maxcompute/src/main/java/org/apache/flink/cdc/connectors/maxcompute/coordinator/message/WaitForFlushSuccessRequest.java b/flink-cdc-connect/flink-cdc-pipeline-connectors/flink-cdc-pipeline-connector-maxcompute/src/main/java/org/apache/flink/cdc/connectors/maxcompute/coordinator/message/WaitForFlushSuccessRequest.java new file mode 100644 index 0000000000..8a8baec6e7 --- /dev/null +++ b/flink-cdc-connect/flink-cdc-pipeline-connectors/flink-cdc-pipeline-connector-maxcompute/src/main/java/org/apache/flink/cdc/connectors/maxcompute/coordinator/message/WaitForFlushSuccessRequest.java @@ -0,0 +1,42 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.flink.cdc.connectors.maxcompute.coordinator.message; + +import org.apache.flink.runtime.operators.coordination.CoordinationRequest; +import org.apache.flink.runtime.operators.coordination.OperatorEvent; + +/** The {@link OperatorEvent} used to synchronize the SessionManageOperator and the SinkWriter.
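+ *
+ * <p>In outline (simplified from {@code SessionManageOperator} and {@code SessionManageCoordinator}),
+ * the requesting operator blocks on the returned future until the coordinator has committed every
+ * session of the current round:
+ *
+ * <pre>{@code
+ * // operator side
+ * Future<CoordinationResponse> waitForSuccess =
+ *         submitRequestToOperator(new WaitForFlushSuccessRequest(indexOfThisSubtask));
+ * waitForSuccess.get();
+ *
+ * // coordinator side: park one future per subtask and complete it in
+ * // completeAllFlushFutures() once all sessions have been committed
+ * waitingFlushFutures[request.getOperatorIndex()] = new CompletableFuture<>();
+ * }</pre>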
*/ +public class WaitForFlushSuccessRequest implements CoordinationRequest { + private static final long serialVersionUID = 1L; + + private final int operatorIndex; + + public WaitForFlushSuccessRequest(int operatorIndex) { + this.operatorIndex = operatorIndex; + } + + public int getOperatorIndex() { + return operatorIndex; + } + + @Override + public String toString() { + return "WaitForFlushSuccessRequest{" + "operatorIndex=" + operatorIndex + '}'; + } +} diff --git a/flink-cdc-connect/flink-cdc-pipeline-connectors/flink-cdc-pipeline-connector-maxcompute/src/main/java/org/apache/flink/cdc/connectors/maxcompute/options/MaxComputeOptions.java b/flink-cdc-connect/flink-cdc-pipeline-connectors/flink-cdc-pipeline-connector-maxcompute/src/main/java/org/apache/flink/cdc/connectors/maxcompute/options/MaxComputeOptions.java new file mode 100644 index 0000000000..6fb74bbe09 --- /dev/null +++ b/flink-cdc-connect/flink-cdc-pipeline-connectors/flink-cdc-pipeline-connector-maxcompute/src/main/java/org/apache/flink/cdc/connectors/maxcompute/options/MaxComputeOptions.java @@ -0,0 +1,147 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.flink.cdc.connectors.maxcompute.options; + +import org.apache.flink.cdc.connectors.maxcompute.utils.MaxComputeUtils; + +import java.io.Serializable; + +/** basic options for MaxCompute. 
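+ * For example (the credential, endpoint and project values below are placeholders):
+ *
+ * <pre>{@code
+ * MaxComputeOptions options =
+ *         MaxComputeOptions.builder("ak", "sk", "http://odps.example.com/api", "flink_cdc")
+ *                 .withBucketSize(8)
+ *                 .withQuotaName("my_quota")
+ *                 .build();
+ * }</pre>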
*/ +public class MaxComputeOptions implements Serializable { + private static final long serialVersionUID = 1L; + + private final String accessId; + private final String accessKey; + private final String endpoint; + private final String project; + private final String tunnelEndpoint; + private final boolean supportSchema; + private final String quotaName; + private final String stsToken; + private final int bucketSize; + private final String schemaOperatorUid; + + private MaxComputeOptions(Builder builder) { + this.accessId = builder.accessId; + this.accessKey = builder.accessKey; + this.endpoint = builder.endpoint; + this.project = builder.project; + this.tunnelEndpoint = builder.tunnelEndpoint; + this.quotaName = builder.quotaName; + this.stsToken = builder.stsToken; + this.bucketSize = builder.bucketSize; + this.supportSchema = MaxComputeUtils.supportSchema(this); + this.schemaOperatorUid = builder.schemaOperatorUid; + } + + public static Builder builder( + String accessId, String accessKey, String endpoint, String project) { + return new Builder(accessId, accessKey, endpoint, project); + } + + public String getTunnelEndpoint() { + return tunnelEndpoint; + } + + public String getAccessId() { + return accessId; + } + + public String getAccessKey() { + return accessKey; + } + + public String getEndpoint() { + return endpoint; + } + + public String getProject() { + return project; + } + + public String getQuotaName() { + return quotaName; + } + + public String getStsToken() { + return stsToken; + } + + public boolean isSupportSchema() { + return supportSchema; + } + + public int getBucketSize() { + return bucketSize; + } + + public String getSchemaOperatorUid() { + return schemaOperatorUid; + } + + /** builder for maxcompute options. */ + public static class Builder { + + private final String accessId; + private final String accessKey; + private final String endpoint; + private final String project; + private String tunnelEndpoint; + private String quotaName; + private String stsToken; + private String schemaOperatorUid; + private int bucketSize = 16; + + public Builder(String accessId, String accessKey, String endpoint, String project) { + this.accessId = accessId; + this.accessKey = accessKey; + this.endpoint = endpoint; + this.project = project; + } + + public Builder withTunnelEndpoint(String tunnelEndpoint) { + this.tunnelEndpoint = tunnelEndpoint; + return this; + } + + public Builder withQuotaName(String quotaName) { + this.quotaName = quotaName; + return this; + } + + public Builder withStsToken(String stsToken) { + this.stsToken = stsToken; + return this; + } + + public Builder withBucketSize(int bucketSize) { + this.bucketSize = bucketSize; + return this; + } + + public Builder withSchemaOperatorUid(String schemaOperatorUid) { + this.schemaOperatorUid = schemaOperatorUid; + return this; + } + + public MaxComputeOptions build() { + return new MaxComputeOptions(this); + } + } +} diff --git a/flink-cdc-connect/flink-cdc-pipeline-connectors/flink-cdc-pipeline-connector-maxcompute/src/main/java/org/apache/flink/cdc/connectors/maxcompute/options/MaxComputeWriteOptions.java b/flink-cdc-connect/flink-cdc-pipeline-connectors/flink-cdc-pipeline-connector-maxcompute/src/main/java/org/apache/flink/cdc/connectors/maxcompute/options/MaxComputeWriteOptions.java new file mode 100644 index 0000000000..c746063e6b --- /dev/null +++ 
b/flink-cdc-connect/flink-cdc-pipeline-connectors/flink-cdc-pipeline-connector-maxcompute/src/main/java/org/apache/flink/cdc/connectors/maxcompute/options/MaxComputeWriteOptions.java @@ -0,0 +1,101 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.flink.cdc.connectors.maxcompute.options; + +import java.io.Serializable; + +/** extended options for maxcompute. */ +public class MaxComputeWriteOptions implements Serializable { + private static final long serialVersionUID = 1L; + private final int flushConcurrent; + private final long maxBufferSize; + private final long slotBufferSize; + private final int numCommitThread; + private final String compressAlgorithm; + + private MaxComputeWriteOptions(Builder builder) { + this.flushConcurrent = builder.flushConcurrent; + this.maxBufferSize = builder.maxBufferSize; + this.slotBufferSize = builder.slotBufferSize; + this.numCommitThread = builder.numCommitThread; + this.compressAlgorithm = builder.compressAlgorithm; + } + + public static Builder builder() { + return new Builder(); + } + + public int getFlushConcurrent() { + return flushConcurrent; + } + + public long getMaxBufferSize() { + return maxBufferSize; + } + + public long getSlotBufferSize() { + return slotBufferSize; + } + + public int getNumCommitThread() { + return numCommitThread; + } + + public String getCompressAlgorithm() { + return compressAlgorithm; + } + + /** builder for maxcompute write options. 
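+ * For example, overriding a few of the defaults (values shown are illustrative):
+ *
+ * <pre>{@code
+ * MaxComputeWriteOptions writeOptions =
+ *         MaxComputeWriteOptions.builder()
+ *                 .withFlushConcurrent(4)
+ *                 .withCompressAlgorithm("snappy")
+ *                 .build();
+ * }</pre>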
*/ + public static class Builder { + private int flushConcurrent = 2; + private long maxBufferSize = 64 * 1024 * 1024L; + private long slotBufferSize = 1024 * 1024L; + private int numCommitThread = 16; + private String compressAlgorithm = "zlib"; + + public Builder withFlushConcurrent(int flushConcurrent) { + this.flushConcurrent = flushConcurrent; + return this; + } + + public Builder withMaxBufferSize(long maxBufferSize) { + this.maxBufferSize = maxBufferSize; + return this; + } + + public Builder withSlotBufferSize(long slotBufferSize) { + this.slotBufferSize = slotBufferSize; + return this; + } + + public Builder withNumCommitThread(int numCommitThread) { + this.numCommitThread = numCommitThread; + return this; + } + + public Builder withCompressAlgorithm(String compressAlgorithm) { + this.compressAlgorithm = compressAlgorithm; + return this; + } + + public MaxComputeWriteOptions build() { + return new MaxComputeWriteOptions(this); + } + } +} diff --git a/flink-cdc-connect/flink-cdc-pipeline-connectors/flink-cdc-pipeline-connector-maxcompute/src/main/java/org/apache/flink/cdc/connectors/maxcompute/sink/MaxComputeEventSink.java b/flink-cdc-connect/flink-cdc-pipeline-connectors/flink-cdc-pipeline-connector-maxcompute/src/main/java/org/apache/flink/cdc/connectors/maxcompute/sink/MaxComputeEventSink.java new file mode 100644 index 0000000000..f860a00397 --- /dev/null +++ b/flink-cdc-connect/flink-cdc-pipeline-connectors/flink-cdc-pipeline-connector-maxcompute/src/main/java/org/apache/flink/cdc/connectors/maxcompute/sink/MaxComputeEventSink.java @@ -0,0 +1,62 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.flink.cdc.connectors.maxcompute.sink; + +import org.apache.flink.api.connector.sink2.Sink; +import org.apache.flink.api.connector.sink2.SinkWriter; +import org.apache.flink.cdc.common.event.Event; +import org.apache.flink.cdc.connectors.maxcompute.common.Constant; +import org.apache.flink.cdc.connectors.maxcompute.coordinator.SessionManageCoordinatedOperatorFactory; +import org.apache.flink.cdc.connectors.maxcompute.options.MaxComputeOptions; +import org.apache.flink.cdc.connectors.maxcompute.options.MaxComputeWriteOptions; +import org.apache.flink.cdc.runtime.typeutils.EventTypeInfo; +import org.apache.flink.streaming.api.connector.sink2.WithPreWriteTopology; +import org.apache.flink.streaming.api.datastream.DataStream; +import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator; + +import java.io.IOException; + +/** A {@link Sink} of {@link Event} to MaxCompute. 
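+ * The sink installs a {@code SessionManageOperator} in front of the sink writers via
+ * {@code addPreWriteTopology}, so that every {@code DataChangeEvent} already carries its tunnel
+ * session id and partition name in the event metadata when it reaches the
+ * {@code MaxComputeEventWriter}.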
*/ +public class MaxComputeEventSink implements Sink, WithPreWriteTopology { + private static final long serialVersionUID = 1L; + private final MaxComputeOptions options; + private final MaxComputeWriteOptions writeOptions; + + public MaxComputeEventSink(MaxComputeOptions options, MaxComputeWriteOptions writeOptions) { + this.options = options; + this.writeOptions = writeOptions; + } + + @Override + public DataStream addPreWriteTopology(DataStream inputDataStream) { + SingleOutputStreamOperator stream = + inputDataStream.transform( + "SessionManageOperator", + new EventTypeInfo(), + new SessionManageCoordinatedOperatorFactory( + options, writeOptions, options.getSchemaOperatorUid())); + stream.uid(Constant.PIPELINE_SESSION_MANAGE_OPERATOR_UID); + return stream; + } + + @Override + public SinkWriter createWriter(InitContext context) throws IOException { + return new MaxComputeEventWriter(options, writeOptions, context); + } +} diff --git a/flink-cdc-connect/flink-cdc-pipeline-connectors/flink-cdc-pipeline-connector-maxcompute/src/main/java/org/apache/flink/cdc/connectors/maxcompute/sink/MaxComputeEventWriter.java b/flink-cdc-connect/flink-cdc-pipeline-connectors/flink-cdc-pipeline-connector-maxcompute/src/main/java/org/apache/flink/cdc/connectors/maxcompute/sink/MaxComputeEventWriter.java new file mode 100644 index 0000000000..a1d3dad1c2 --- /dev/null +++ b/flink-cdc-connect/flink-cdc-pipeline-connectors/flink-cdc-pipeline-connector-maxcompute/src/main/java/org/apache/flink/cdc/connectors/maxcompute/sink/MaxComputeEventWriter.java @@ -0,0 +1,190 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.flink.cdc.connectors.maxcompute.sink; + +import org.apache.flink.api.connector.sink2.Sink; +import org.apache.flink.api.connector.sink2.SinkWriter; +import org.apache.flink.cdc.common.event.CreateTableEvent; +import org.apache.flink.cdc.common.event.DataChangeEvent; +import org.apache.flink.cdc.common.event.Event; +import org.apache.flink.cdc.common.event.OperationType; +import org.apache.flink.cdc.common.event.SchemaChangeEvent; +import org.apache.flink.cdc.common.event.TableId; +import org.apache.flink.cdc.common.schema.Schema; +import org.apache.flink.cdc.common.utils.SchemaUtils; +import org.apache.flink.cdc.connectors.maxcompute.common.Constant; +import org.apache.flink.cdc.connectors.maxcompute.common.SessionIdentifier; +import org.apache.flink.cdc.connectors.maxcompute.coordinator.SessionManageOperator; +import org.apache.flink.cdc.connectors.maxcompute.coordinator.message.CommitSessionRequest; +import org.apache.flink.cdc.connectors.maxcompute.coordinator.message.CommitSessionResponse; +import org.apache.flink.cdc.connectors.maxcompute.options.MaxComputeOptions; +import org.apache.flink.cdc.connectors.maxcompute.options.MaxComputeWriteOptions; +import org.apache.flink.cdc.connectors.maxcompute.utils.MaxComputeUtils; +import org.apache.flink.cdc.connectors.maxcompute.utils.TypeConvertUtils; +import org.apache.flink.cdc.connectors.maxcompute.writer.MaxComputeWriter; +import org.apache.flink.cdc.runtime.operators.schema.event.CoordinationResponseUtils; +import org.apache.flink.runtime.operators.coordination.CoordinationResponse; + +import com.aliyun.odps.data.ArrayRecord; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.concurrent.ExecutionException; +import java.util.concurrent.Future; + +/** a {@link SinkWriter} for {@link Event} for MaxCompute. 
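+ *
+ * <p>On {@code flush()}, every open session is flushed and then handed to the coordinator for
+ * committing, followed by an {@code END_OF_SESSION} marker for this subtask. Simplified:
+ *
+ * <pre>{@code
+ * for (Map.Entry<String, MaxComputeWriter> entry : writerMap.entrySet()) {
+ *     entry.getValue().flush();
+ *     responses.add(operator.submitRequestToOperator(
+ *             new CommitSessionRequest(context.getSubtaskId(), entry.getKey())));
+ * }
+ * responses.add(operator.submitRequestToOperator(
+ *         new CommitSessionRequest(context.getSubtaskId(), Constant.END_OF_SESSION)));
+ * // an unsuccessful CommitSessionResponse is surfaced as an IOException, which
+ * // restarts the job from the latest checkpoint
+ * }</pre>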
*/ +public class MaxComputeEventWriter implements SinkWriter { + private static final Logger LOG = LoggerFactory.getLogger(MaxComputeEventWriter.class); + + private final Sink.InitContext context; + private final MaxComputeOptions options; + private final MaxComputeWriteOptions writeOptions; + private final Map writerMap; + private final Map schemaCache; + + public MaxComputeEventWriter( + MaxComputeOptions options, + MaxComputeWriteOptions writeOptions, + Sink.InitContext context) { + this.context = context; + this.options = options; + this.writeOptions = writeOptions; + + this.writerMap = new HashMap<>(); + this.schemaCache = new HashMap<>(); + } + + @Override + public void write(Event element, Context context) throws IOException { + if (element instanceof DataChangeEvent) { + DataChangeEvent dataChangeEvent = (DataChangeEvent) element; + String sessionId = dataChangeEvent.meta().get(Constant.TUNNEL_SESSION_ID); + String partitionName = dataChangeEvent.meta().get(Constant.MAXCOMPUTE_PARTITION_NAME); + if (!writerMap.containsKey(sessionId)) { + LOG.info( + "Sink writer {} start to create session {}.", + this.context.getSubtaskId(), + sessionId); + SessionIdentifier sessionIdentifier = + SessionIdentifier.of( + options.getProject(), + MaxComputeUtils.getSchema(options, dataChangeEvent.tableId()), + dataChangeEvent.tableId().getTableName(), + partitionName, + sessionId); + writerMap.put( + sessionId, + MaxComputeWriter.batchWriter(options, writeOptions, sessionIdentifier)); + } + MaxComputeWriter writer = writerMap.get(sessionId); + ArrayRecord record = writer.newElement(); + + if (dataChangeEvent.op() != OperationType.DELETE) { + TypeConvertUtils.toMaxComputeRecord( + schemaCache.get(dataChangeEvent.tableId()), + dataChangeEvent.after(), + record); + writer.write(record); + } else { + TypeConvertUtils.toMaxComputeRecord( + schemaCache.get(dataChangeEvent.tableId()), + dataChangeEvent.before(), + record); + writer.delete(record); + } + } else if (element instanceof CreateTableEvent) { + CreateTableEvent createTableEvent = (CreateTableEvent) element; + schemaCache.put(createTableEvent.tableId(), createTableEvent.getSchema()); + } else if (element instanceof SchemaChangeEvent) { + SchemaChangeEvent schemaChangeEvent = (SchemaChangeEvent) element; + TableId tableId = schemaChangeEvent.tableId(); + Schema newSchema = + SchemaUtils.applySchemaChangeEvent(schemaCache.get(tableId), schemaChangeEvent); + schemaCache.put(tableId, newSchema); + } + } + + @Override + public void flush(boolean endOfInput) throws IOException, InterruptedException { + SessionManageOperator operator = SessionManageOperator.instance; + LOG.info("Sink writer {} start to flush.", context.getSubtaskId()); + List> responces = new ArrayList<>(writerMap.size() + 1); + writerMap.entrySet().stream() + .sorted(Map.Entry.comparingByKey()) + .forEach( + entry -> { + try { + entry.getValue().flush(); + Future future = + operator.submitRequestToOperator( + new CommitSessionRequest( + context.getSubtaskId(), entry.getKey())); + responces.add(future); + } catch (Exception e) { + throw new RuntimeException(e); + } + }); + writerMap.clear(); + Future future = + operator.submitRequestToOperator( + new CommitSessionRequest(context.getSubtaskId(), Constant.END_OF_SESSION)); + responces.add(future); + try { + for (Future response : responces) { + CommitSessionResponse commitSessionResponse = + CoordinationResponseUtils.unwrap(response.get()); + if (!commitSessionResponse.isSuccess()) { + throw new IOException( + "JobManager commit session 
failed. restart all TaskManager"); + } + } + } catch (ExecutionException e) { + throw new IOException(e); + } + LOG.info("Sink writer {} flush success.", context.getSubtaskId()); + } + + @Override + public void close() throws Exception { + // do nothing + } +} diff --git a/flink-cdc-connect/flink-cdc-pipeline-connectors/flink-cdc-pipeline-connector-maxcompute/src/main/java/org/apache/flink/cdc/connectors/maxcompute/sink/MaxComputeHashFunctionProvider.java b/flink-cdc-connect/flink-cdc-pipeline-connectors/flink-cdc-pipeline-connector-maxcompute/src/main/java/org/apache/flink/cdc/connectors/maxcompute/sink/MaxComputeHashFunctionProvider.java new file mode 100644 index 0000000000..a8ef034430 --- /dev/null +++ b/flink-cdc-connect/flink-cdc-pipeline-connectors/flink-cdc-pipeline-connector-maxcompute/src/main/java/org/apache/flink/cdc/connectors/maxcompute/sink/MaxComputeHashFunctionProvider.java @@ -0,0 +1,107 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.flink.cdc.connectors.maxcompute.sink; + +import org.apache.flink.cdc.common.data.RecordData; +import org.apache.flink.cdc.common.event.DataChangeEvent; +import org.apache.flink.cdc.common.event.OperationType; +import org.apache.flink.cdc.common.event.TableId; +import org.apache.flink.cdc.common.function.HashFunction; +import org.apache.flink.cdc.common.function.HashFunctionProvider; +import org.apache.flink.cdc.common.schema.Schema; +import org.apache.flink.cdc.connectors.maxcompute.options.MaxComputeOptions; +import org.apache.flink.cdc.connectors.maxcompute.utils.TypeConvertUtils; + +import com.aliyun.odps.tunnel.hasher.TypeHasher; + +import javax.annotation.Nullable; + +import java.util.ArrayList; +import java.util.List; + +/** + * Hash function for maxcompute to distribute data change event to different maxcompute sink by + * primary key. + */ +public class MaxComputeHashFunctionProvider implements HashFunctionProvider { + private static final long serialVersionUID = 1L; + private final int bucketSize; + + public MaxComputeHashFunctionProvider(MaxComputeOptions options) { + this.bucketSize = options.getBucketSize(); + } + + @Override + public HashFunction getHashFunction(@Nullable TableId tableId, Schema schema) { + return new MaxComputeHashFunction(schema, bucketSize); + } + + static class MaxComputeHashFunction implements HashFunction { + private final int bucketSize; + private final List primaryKeyGetters; + + public MaxComputeHashFunction(Schema schema, int bucketSize) { + primaryKeyGetters = createFieldGetters(schema); + this.bucketSize = bucketSize; + } + + @Override + public int hashcode(DataChangeEvent event) { + List hashes = new ArrayList<>(); + RecordData data = + event.op().equals(OperationType.DELETE) ? 
event.before() : event.after(); + for (RecordData.FieldGetter primaryKeyGetter : primaryKeyGetters) { + Object object = primaryKeyGetter.getFieldOrNull(data); + int hash = + object == null + ? 0 + : TypeHasher.hash( + TypeConvertUtils.inferMaxComputeType(object), object); + hashes.add(hash); + } + return TypeHasher.CombineHashVal(hashes) % bucketSize; + } + + private List createFieldGetters(Schema schema) { + List fieldGetters = + new ArrayList<>(schema.primaryKeys().size()); + schema.primaryKeys().stream() + .mapToInt( + pk -> { + int index = schema.getColumnNames().indexOf(pk); + if (index == -1) { + throw new IllegalStateException( + String.format( + "Unable to find column \"%s\" which is defined as primary key", + pk)); + } + return index; + }) + .forEach( + primaryKeyPosition -> + fieldGetters.add( + TypeConvertUtils.createFieldGetter( + schema.getColumns() + .get(primaryKeyPosition) + .getType(), + primaryKeyPosition))); + return fieldGetters; + } + } +} diff --git a/flink-cdc-connect/flink-cdc-pipeline-connectors/flink-cdc-pipeline-connector-maxcompute/src/main/java/org/apache/flink/cdc/connectors/maxcompute/utils/MaxComputeUtils.java b/flink-cdc-connect/flink-cdc-pipeline-connectors/flink-cdc-pipeline-connector-maxcompute/src/main/java/org/apache/flink/cdc/connectors/maxcompute/utils/MaxComputeUtils.java new file mode 100644 index 0000000000..f4e5b6979f --- /dev/null +++ b/flink-cdc-connect/flink-cdc-pipeline-connectors/flink-cdc-pipeline-connector-maxcompute/src/main/java/org/apache/flink/cdc/connectors/maxcompute/utils/MaxComputeUtils.java @@ -0,0 +1,244 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.flink.cdc.connectors.maxcompute.utils; + +import org.apache.flink.cdc.common.event.TableId; +import org.apache.flink.cdc.common.utils.StringUtils; +import org.apache.flink.cdc.connectors.maxcompute.common.Constant; +import org.apache.flink.cdc.connectors.maxcompute.common.SessionIdentifier; +import org.apache.flink.cdc.connectors.maxcompute.common.UncheckedOdpsException; +import org.apache.flink.cdc.connectors.maxcompute.options.MaxComputeOptions; +import org.apache.flink.cdc.connectors.maxcompute.options.MaxComputeWriteOptions; + +import com.aliyun.odps.Column; +import com.aliyun.odps.Odps; +import com.aliyun.odps.OdpsException; +import com.aliyun.odps.PartitionSpec; +import com.aliyun.odps.TableSchema; +import com.aliyun.odps.account.Account; +import com.aliyun.odps.account.AliyunAccount; +import com.aliyun.odps.account.StsAccount; +import com.aliyun.odps.tunnel.Configuration; +import com.aliyun.odps.tunnel.TableTunnel; +import com.aliyun.odps.tunnel.io.CompressOption; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.util.List; + +/** common utils use for maxcompute connector. 
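+ * For example (assuming {@code options}, {@code writeOptions} and {@code tableId} are in scope):
+ *
+ * <pre>{@code
+ * Odps odps = MaxComputeUtils.getOdps(options);
+ * TableTunnel tunnel = MaxComputeUtils.getTunnel(options, writeOptions);
+ * boolean exists = MaxComputeUtils.isTableExist(options, tableId);
+ * }</pre>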
*/ +public class MaxComputeUtils { + + private static final Logger LOG = LoggerFactory.getLogger(MaxComputeUtils.class); + + public static Odps getOdps(MaxComputeOptions maxComputeOptions) { + Account account; + if (StringUtils.isNullOrWhitespaceOnly(maxComputeOptions.getStsToken())) { + account = + new AliyunAccount( + maxComputeOptions.getAccessId(), maxComputeOptions.getAccessKey()); + } else { + account = + new StsAccount( + maxComputeOptions.getAccessId(), + maxComputeOptions.getAccessKey(), + maxComputeOptions.getStsToken()); + } + Odps odps = new Odps(account); + odps.setEndpoint(maxComputeOptions.getEndpoint()); + odps.setTunnelEndpoint(maxComputeOptions.getTunnelEndpoint()); + odps.setDefaultProject(maxComputeOptions.getProject()); + odps.setUserAgent("Flink CDC"); + return odps; + } + + public static TableTunnel getTunnel( + MaxComputeOptions maxComputeOptions, MaxComputeWriteOptions writeOptions) { + Odps odps = getOdps(maxComputeOptions); + Configuration configuration = + Configuration.builder(odps) + .withRetryLogger(RetryUtils.getRetryLogger()) + .withRetryPolicy(new RetryUtils.FlinkDefaultRetryPolicy()) + .withCompressOptions( + MaxComputeUtils.compressOptionOf( + writeOptions.getCompressAlgorithm())) + .withQuotaName(maxComputeOptions.getQuotaName()) + .build(); + TableTunnel tunnel = new TableTunnel(odps, configuration); + if (!StringUtils.isNullOrWhitespaceOnly(maxComputeOptions.getTunnelEndpoint())) { + tunnel.setEndpoint(maxComputeOptions.getTunnelEndpoint()); + } + return tunnel; + } + + public static TableSchema getTableSchema(MaxComputeOptions options, TableId tableId) { + Odps odps = getOdps(options); + if (options.isSupportSchema()) { + return odps.tables() + .get(options.getProject(), tableId.getNamespace(), tableId.getTableName()) + .getSchema(); + } else { + return odps.tables().get(options.getProject(), tableId.getTableName()).getSchema(); + } + } + + public static boolean supportSchema(MaxComputeOptions maxComputeOptions) { + Odps odps = getOdps(maxComputeOptions); + try { + boolean flag = + Boolean.parseBoolean( + odps.projects().get().getProperty(Constant.SCHEMA_ENABLE_FLAG)); + LOG.info("project {} is support schema: {}", maxComputeOptions.getProject(), flag); + return flag; + } catch (OdpsException e) { + throw new UncheckedOdpsException(e); + } + } + + public static CompressOption compressOptionOf(String compressAlgo) { + CompressOption.CompressAlgorithm compressAlgorithm; + switch (compressAlgo) { + case "raw": + compressAlgorithm = CompressOption.CompressAlgorithm.ODPS_RAW; + break; + case "zlib": + compressAlgorithm = CompressOption.CompressAlgorithm.ODPS_ZLIB; + break; + case "lz4": + compressAlgorithm = CompressOption.CompressAlgorithm.ODPS_LZ4_FRAME; + break; + case "snappy": + compressAlgorithm = CompressOption.CompressAlgorithm.ODPS_SNAPPY; + break; + default: + throw new IllegalArgumentException( + "unknown compress algo: " + + compressAlgo + + " , only support raw, zlib, lz4, snappy"); + } + return new CompressOption(compressAlgorithm, 1, 0); + } + + public static boolean isTableExist(MaxComputeOptions maxComputeOptions, TableId tableId) { + Odps odps = getOdps(maxComputeOptions); + try { + if (maxComputeOptions.isSupportSchema()) { + return odps.tables() + .exists( + odps.getDefaultProject(), + tableId.getNamespace(), + tableId.getTableName()); + } else { + return odps.tables().exists(tableId.getTableName()); + } + } catch (OdpsException e) { + throw new UncheckedOdpsException(e); + } + } + + public static boolean schemaEquals(TableSchema 
currentSchema, TableSchema expectSchema) { + List currentColumns = currentSchema.getAllColumns(); + List expectColumns = expectSchema.getAllColumns(); + if (currentColumns.size() != expectColumns.size() + || currentSchema.getColumns().size() != expectSchema.getColumns().size()) { + LOG.error( + "current column size not equals to expect column size: {}, {}", + currentColumns.size(), + expectColumns.size()); + return false; + } + for (int i = 0; i < currentColumns.size(); i++) { + if (!currentColumns.get(i).getName().equalsIgnoreCase(expectColumns.get(i).getName())) { + LOG.error( + "current column {} name not equals to expect column name: {}", + currentColumns.get(i).getName(), + expectColumns.get(i).getName()); + return false; + } + if (!currentColumns + .get(i) + .getTypeInfo() + .getTypeName() + .equals(expectColumns.get(i).getTypeInfo().getTypeName())) { + LOG.error( + "current column {} type not equals to expect column type: {}", + currentColumns.get(i).getTypeInfo().getTypeName(), + expectColumns.get(i).getTypeInfo().getTypeName()); + return false; + } + } + return true; + } + + public static void createPartitionIfAbsent( + MaxComputeOptions options, String schema, String table, String partitionName) + throws OdpsException { + Odps odps = getOdps(options); + if (options.isSupportSchema()) { + if (StringUtils.isNullOrWhitespaceOnly(schema)) { + LOG.info( + "create partition {} in {}.default.{}", + partitionName, + options.getProject(), + table); + odps.tables() + .get(options.getProject(), "default", table) + .createPartition(new PartitionSpec(partitionName), true); + } else { + LOG.info( + "create partition {} in {}.{}.{}", + partitionName, + options.getProject(), + schema, + table); + odps.tables() + .get(options.getProject(), schema, table) + .createPartition(new PartitionSpec(partitionName), true); + } + } else { + LOG.info("create partition {} in {}.{}", partitionName, options.getProject(), table); + odps.tables() + .get(options.getProject(), table) + .createPartition(new PartitionSpec(partitionName), true); + } + } + + public static String getSchema(MaxComputeOptions options, TableId tableId) { + if (options.isSupportSchema()) { + if (tableId.getNamespace() == null) { + return "default"; + } else { + return tableId.getNamespace(); + } + } else { + return null; + } + } + + public static boolean isTransactionalTable( + MaxComputeOptions options, SessionIdentifier sessionIdentifier) { + Odps odps = getOdps(options); + return odps.tables() + .get( + sessionIdentifier.getProject(), + sessionIdentifier.getSchema(), + sessionIdentifier.getTable()) + .isTransactional(); + } +} diff --git a/flink-cdc-connect/flink-cdc-pipeline-connectors/flink-cdc-pipeline-connector-maxcompute/src/main/java/org/apache/flink/cdc/connectors/maxcompute/utils/RetryUtils.java b/flink-cdc-connect/flink-cdc-pipeline-connectors/flink-cdc-pipeline-connector-maxcompute/src/main/java/org/apache/flink/cdc/connectors/maxcompute/utils/RetryUtils.java new file mode 100644 index 0000000000..1d7b7d3d15 --- /dev/null +++ b/flink-cdc-connect/flink-cdc-pipeline-connectors/flink-cdc-pipeline-connector-maxcompute/src/main/java/org/apache/flink/cdc/connectors/maxcompute/utils/RetryUtils.java @@ -0,0 +1,149 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.flink.cdc.connectors.maxcompute.utils; + +import com.aliyun.odps.OdpsException; +import com.aliyun.odps.rest.RestClient; +import com.aliyun.odps.tunnel.io.TunnelRetryHandler; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.IOException; +import java.io.UncheckedIOException; +import java.util.concurrent.Callable; +import java.util.concurrent.TimeUnit; + +/** + * Retry utilities to execute a callable with specific retry times. Set MAX_RETRIES and RETRY_DELAY + * at the start of {@link org.apache.flink.cdc.connectors.maxcompute.sink.MaxComputeEventSink}. + */ +public class RetryUtils { + public static final Logger LOG = LoggerFactory.getLogger(RetryUtils.class); + private static final int DEFAULT_MAX_RETRIES = 3; + private static final long DEFAULT_RETRY_DELAY = 5000; + private static final RetryLogger RETRY_LOGGER = new RetryLogger(); + + public static RetryLogger getRetryLogger() { + return RETRY_LOGGER; + } + + /** + * Executes a callable with default retry strategy. + * + * @param callable the task to be executed + * @param the type of the task's result + * @return the task result + * @throws IOException If the task fails after all retries + */ + public static T execute(Callable callable) throws IOException { + return execute(callable, DEFAULT_MAX_RETRIES, DEFAULT_RETRY_DELAY); + } + + /** + * Executes a callable with specific retry strategy. + * + * @param callable the task to be executed + * @param maxRetries the maximum number of retries + * @param retryDelay the delay between retries in milliseconds + * @param the type of the task's result + * @return the task result + * @throws IOException If the task fails after all retries + */ + public static T execute(Callable callable, int maxRetries, long retryDelay) + throws IOException { + int attempt = 0; + while (true) { + try { + return callable.call(); + } catch (Exception e) { + attempt++; + if (attempt > maxRetries) { + if (e instanceof OdpsException) { + throw new IOException( + "Failed after retries. RequestId: " + + ((OdpsException) e).getRequestId(), + e); + } + throw new IOException("Failed after retries", e); + } + try { + RETRY_LOGGER.onRetryLog(e, attempt, retryDelay); + Thread.sleep(retryDelay); + } catch (InterruptedException ie) { + Thread.currentThread().interrupt(); + throw new IOException("Retry interrupted", ie); + } + } + } + } + + /** + * Executes a callable with default retry strategy and unchecked exceptions. + * + * @param callable the task to be executed + * @param the type of the task's result + * @return the task result + */ + public static T executeUnchecked(Callable callable) { + return executeUnchecked(callable, DEFAULT_MAX_RETRIES, DEFAULT_RETRY_DELAY); + } + + /** + * Executes a callable with specific retry strategy and unchecked exceptions. 
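 + * Any {@link IOException} thrown once the retries are exhausted is rethrown wrapped in an
 + * {@link UncheckedIOException}, so this variant can be used inside lambdas that cannot declare
 + * checked exceptions. Illustrative usage (the table name is only an example):
 + * {@code Table t = RetryUtils.executeUnchecked(() -> odps.tables().get("my_table"), 3, 5000L);}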
+ * + * @param callable the task to be executed + * @param maxRetries the maximum number of retries + * @param retryDelay the delay between retries in milliseconds + * @param the type of the task's result + * @return the task result + */ + public static T executeUnchecked(Callable callable, int maxRetries, long retryDelay) { + try { + return execute(callable, maxRetries, retryDelay); + } catch (IOException e) { + throw new UncheckedIOException(e); + } + } + + static class RetryLogger extends RestClient.RetryLogger { + @Override + public void onRetryLog(Throwable e, long retryCount, long retrySleepTime) { + // Log the exception and retry details + LOG.warn( + "Retry attempt #{} failed. Exception: {}. Sleeping for {} ms before next attempt.", + retryCount, + e.getMessage(), + retrySleepTime, + e); + } + } + + // retry 3 times and wait 5 seconds for each retry + static class FlinkDefaultRetryPolicy implements TunnelRetryHandler.RetryPolicy { + @Override + public boolean shouldRetry(Exception e, int attempt) { + return attempt <= DEFAULT_MAX_RETRIES; + } + + @Override + public long getRetryWaitTime(int attempt) { + return TimeUnit.MILLISECONDS.toMillis(DEFAULT_RETRY_DELAY); + } + } +} diff --git a/flink-cdc-connect/flink-cdc-pipeline-connectors/flink-cdc-pipeline-connector-maxcompute/src/main/java/org/apache/flink/cdc/connectors/maxcompute/utils/SchemaEvolutionUtils.java b/flink-cdc-connect/flink-cdc-pipeline-connectors/flink-cdc-pipeline-connector-maxcompute/src/main/java/org/apache/flink/cdc/connectors/maxcompute/utils/SchemaEvolutionUtils.java new file mode 100644 index 0000000000..95989c92e4 --- /dev/null +++ b/flink-cdc-connect/flink-cdc-pipeline-connectors/flink-cdc-pipeline-connector-maxcompute/src/main/java/org/apache/flink/cdc/connectors/maxcompute/utils/SchemaEvolutionUtils.java @@ -0,0 +1,256 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.flink.cdc.connectors.maxcompute.utils; + +import org.apache.flink.cdc.common.event.AddColumnEvent; +import org.apache.flink.cdc.common.event.TableId; +import org.apache.flink.cdc.common.schema.Schema; +import org.apache.flink.cdc.common.types.DataType; +import org.apache.flink.cdc.common.utils.StringUtils; +import org.apache.flink.cdc.connectors.maxcompute.options.MaxComputeOptions; +import org.apache.flink.util.CollectionUtil; + +import com.aliyun.odps.Instance; +import com.aliyun.odps.Odps; +import com.aliyun.odps.OdpsException; +import com.aliyun.odps.TableSchema; +import com.aliyun.odps.Tables; +import com.aliyun.odps.task.SQLTask; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.util.HashMap; +import java.util.List; +import java.util.Map; + +/** Schema evolution utils for maxcompute. 
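 + * Column-level DDL (add/alter/drop/rename column) is submitted through {@link SQLTask} with hints
 + * that enable schema evolution; a different hint set is applied depending on whether the target
 + * project has the namespace schema model enabled. As an illustrative example (project, schema,
 + * table and column names are placeholders), adding a STRING column produces a statement
 + * equivalent to
 + * {@code alter table my_project.my_schema.my_table add columns (c1 STRING comment 'STRING');}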
*/ +public class SchemaEvolutionUtils { + private static final Logger LOG = LoggerFactory.getLogger(SchemaEvolutionUtils.class); + private static final Map unsupportSchemahints = new HashMap<>(); + private static final Map supportSchemaHints = new HashMap<>(); + + static { + unsupportSchemahints.put("odps.sql.type.system.odps2", "true"); + unsupportSchemahints.put("odps.sql.decimal.odps2", "true"); + unsupportSchemahints.put("odps.sql.allow.schema.evolution", "true"); + + supportSchemaHints.put("odps.sql.type.system.odps2", "true"); + supportSchemaHints.put("odps.sql.decimal.odps2", "true"); + supportSchemaHints.put("odps.namespace.schema", "true"); + supportSchemaHints.put("odps.sql.allow.namespace.schema", "true"); + supportSchemaHints.put("odps.sql.allow.schema.evolution", "true"); + } + + private SchemaEvolutionUtils() {} + + /** + * equals to run a sql like: create table table_name (col_name1 type1 comment [, col_name2 type2 + * ...]);. + */ + public static void createTable(MaxComputeOptions options, TableId tableId, Schema schema) + throws OdpsException { + Odps odps = MaxComputeUtils.getOdps(options); + TableSchema tableSchema = TypeConvertUtils.toMaxCompute(schema); + if (options.isSupportSchema() + && !StringUtils.isNullOrWhitespaceOnly(tableId.getNamespace())) { + LOG.info("create schema {}", tableId.getNamespace()); + odps.schemas() + .create( + odps.getDefaultProject(), + tableId.getNamespace(), + "generate by Flink CDC", + true); + } + Tables.TableCreator tableCreator = + odps.tables() + .newTableCreator( + odps.getDefaultProject(), tableId.getTableName(), tableSchema) + .withHints(unsupportSchemahints) + .ifNotExists() + .debug(); + if (!CollectionUtil.isNullOrEmpty(schema.primaryKeys())) { + tableCreator + .transactionTable() + .withBucketNum(options.getBucketSize()) + .withPrimaryKeys(schema.primaryKeys()); + } + if (options.isSupportSchema()) { + if (StringUtils.isNullOrWhitespaceOnly(tableId.getNamespace())) { + tableCreator.withSchemaName("default").withHints(supportSchemaHints); + } else { + tableCreator.withSchemaName(tableId.getNamespace()).withHints(supportSchemaHints); + } + } + LOG.info("create table {}, schema {}", getFullTableName(options, tableId), schema); + tableCreator.create(); + } + + /** + * equals to run a sql like: 'alter table table_name add columns (col_name1 type1 comment [, + * col_name2 type2 ...]);'. + */ + public static void addColumns( + MaxComputeOptions options, + TableId tableId, + List columns) + throws OdpsException { + Odps odps = MaxComputeUtils.getOdps(options); + + StringBuilder sqlBuilder = + new StringBuilder( + "alter table " + getFullTableName(options, tableId) + " add columns ("); + + for (AddColumnEvent.ColumnWithPosition addColumn : columns) { + if (addColumn.getPosition() == AddColumnEvent.ColumnPosition.LAST) { + sqlBuilder + .append(addColumn.getAddColumn().getName()) + .append(" ") + .append(string(addColumn.getAddColumn().getType())) + .append(" comment '") + .append(addColumn.getAddColumn().getType().asSummaryString()) + .append("',"); + } else { + throw new UnsupportedOperationException( + "Not support position: " + + addColumn.getPosition() + + " " + + addColumn.getExistedColumnName()); + } + } + // remove ',' + sqlBuilder.deleteCharAt(sqlBuilder.length() - 1); + sqlBuilder.append(");"); + + Instance instance = + SQLTask.run( + odps, + odps.getDefaultProject(), + sqlBuilder.toString(), + options.isSupportSchema() ? 
supportSchemaHints : unsupportSchemahints, + null); + LOG.info("execute add column task: `{}`, instanceId: {}", sqlBuilder, instance.getId()); + instance.waitForSuccess(); + } + + /** + * equals to run a sql like: 'alter table table_name change column old_column_name + * new_column_name new_data_type;'. and 'alter table table_name change column col_name comment + * 'col_comment''; + */ + public static void alterColumnType( + MaxComputeOptions options, TableId tableId, Map typeMapping) + throws OdpsException { + Odps odps = MaxComputeUtils.getOdps(options); + + String prefix = "alter table " + getFullTableName(options, tableId) + " change column "; + + for (Map.Entry entry : typeMapping.entrySet()) { + String alterColumnSql = + prefix + + entry.getKey() + + " " + + entry.getKey() + + " " + + string(entry.getValue()) + + ";"; + Instance instance = + SQLTask.run( + odps, + odps.getDefaultProject(), + alterColumnSql, + options.isSupportSchema() ? supportSchemaHints : unsupportSchemahints, + null); + LOG.info( + "execute alter column task: `{}`, instanceId: {}", + alterColumnSql, + instance.getId()); + instance.waitForSuccess(); + } + } + + /** + * equals to run a sql like: 'alter table table_name drop columns col_name1[, col_name2...];'. + */ + public static void dropColumn( + MaxComputeOptions options, TableId tableId, List droppedColumnNames) + throws OdpsException { + Odps odps = MaxComputeUtils.getOdps(options); + StringBuilder sqlBuilder = + new StringBuilder( + "alter table " + getFullTableName(options, tableId) + " drop columns "); + for (String column : droppedColumnNames) { + sqlBuilder.append(column).append(","); + } + // remove ',' + sqlBuilder.deleteCharAt(sqlBuilder.length() - 1); + sqlBuilder.append(";"); + Instance instance = + SQLTask.run( + odps, + odps.getDefaultProject(), + sqlBuilder.toString(), + options.isSupportSchema() ? supportSchemaHints : unsupportSchemahints, + null); + LOG.info("execute drop column task: `{}`, instanceId: {}", sqlBuilder, instance.getId()); + instance.waitForSuccess(); + } + + /** + * equals to run a sql like: 'alter table table_name change column old_col_name rename to + * new_col_name;'. + */ + public static void renameColumn( + MaxComputeOptions options, TableId tableId, Map nameMapping) + throws OdpsException { + Odps odps = MaxComputeUtils.getOdps(options); + String prefix = "alter table " + getFullTableName(options, tableId) + " change column "; + for (Map.Entry entry : nameMapping.entrySet()) { + String sql = prefix + entry.getKey() + " rename to " + entry.getValue() + ";"; + Instance instance = + SQLTask.run( + odps, + odps.getDefaultProject(), + sql, + options.isSupportSchema() ? supportSchemaHints : unsupportSchemahints, + null); + LOG.info("execute rename column task: `{}`, instanceId: {}", sql, instance.getId()); + instance.waitForSuccess(); + } + } + + private static String getFullTableName(MaxComputeOptions options, TableId tableId) { + if (options.isSupportSchema()) { + if (StringUtils.isNullOrWhitespaceOnly(tableId.getNamespace())) { + return options.getProject() + ".default." + tableId.getTableName(); + } else { + return options.getProject() + + "." + + tableId.getNamespace() + + "." + + tableId.getTableName(); + } + } else { + return options.getProject() + "." 
+ tableId.getTableName(); + } + } + + private static String string(DataType dataType) { + return TypeConvertUtils.toMaxCompute(dataType).getTypeName(); + } +} diff --git a/flink-cdc-connect/flink-cdc-pipeline-connectors/flink-cdc-pipeline-connector-maxcompute/src/main/java/org/apache/flink/cdc/connectors/maxcompute/utils/SessionCommitCoordinateHelper.java b/flink-cdc-connect/flink-cdc-pipeline-connectors/flink-cdc-pipeline-connector-maxcompute/src/main/java/org/apache/flink/cdc/connectors/maxcompute/utils/SessionCommitCoordinateHelper.java new file mode 100644 index 0000000000..011b9e7d6b --- /dev/null +++ b/flink-cdc-connect/flink-cdc-pipeline-connectors/flink-cdc-pipeline-connector-maxcompute/src/main/java/org/apache/flink/cdc/connectors/maxcompute/utils/SessionCommitCoordinateHelper.java @@ -0,0 +1,158 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.flink.cdc.connectors.maxcompute.utils; + +import org.apache.flink.cdc.common.utils.Preconditions; +import org.apache.flink.cdc.connectors.maxcompute.common.Constant; +import org.apache.flink.cdc.connectors.maxcompute.coordinator.message.CommitSessionResponse; +import org.apache.flink.cdc.runtime.operators.schema.event.CoordinationResponseUtils; +import org.apache.flink.runtime.operators.coordination.CoordinationResponse; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.util.ArrayDeque; +import java.util.Comparator; +import java.util.HashMap; +import java.util.Map; +import java.util.Queue; +import java.util.concurrent.CompletableFuture; + +/** + * The SessionCommitCoordinator class is responsible for coordinating and controlling the order of + * session submissions by multiple concurrent executors in a distributed processing environment. It + * ensures that: 1. Each executor must submit sessions in ascending order by session ID. 2. Each + * executor must submit a Constant.END_OF_SESSION as a terminator after completing its session + * submissions. + * + *

Working Principle: - Maintains an array of queues (toCommitSessionIds), with each queue + * corresponding to an executor, to isolate the session submissions of different executors. This + * maintains the independence and submission order of each executor. - Executors submit session IDs + * sequentially by invoking the commit() method. The commit operation simply enqueues the session ID + * into the corresponding executor's queue. - The getToCommitSessionId() method is tasked with + * selecting the smallest session ID across all executors that has been "submitted" or is "no longer + * required" for submission, allowing for further processing. "Submitted" means that the session ID + * has been submitted by all executors; "no longer required" assumes that any subsequent session IDs + * that are yet to be submitted will always be greater than the currently chosen ID. - Once a + * session ID is selected by the getToCommitSessionId() method, it's removed from all executors' + * queues, indicating that the session ID has been processed. This process ensures ordered + * processing of the sessions and allows the system to efficiently progress. - Each processing step + * of the session IDs is based on a key assumption: that any subsequent session ID submissions will + * always be greater than the current processed session ID. This is guaranteed by the fact that each + * executor commits to submitting session IDs in order and submits a special terminator + * (Constant.END_OF_SESSION) at the end. + * + *

Note: - The class presupposes that all session IDs are comparable, and each executor strictly + * adheres to the submission order of session IDs in ascending order. Any behavior that deviates + * from this principle may lead to unpredictable outcomes, as it contravenes the fundamental + * assumption of the class's design. - The introduction of Constant.END_OF_SESSION as a terminator + * is a key aspect of this coordination strategy, as it provides a clear signal for recognizing the + * completion status of an executor, allowing the system to determine whether all relevant sessions + * have been processed. + */ +public class SessionCommitCoordinateHelper { + private static final Logger LOG = LoggerFactory.getLogger(SessionCommitCoordinateHelper.class); + private final Queue[] toCommitSessionIds; + private final Map> toCommitFutures; + /** + * If any string is {@link Constant#END_OF_SESSION}, it should be considered larger than any + * other non-{@link Constant#END_OF_SESSION} string. + */ + private final Comparator comparator = + (String a, String b) -> { + if (a.equals(Constant.END_OF_SESSION) || b.equals(Constant.END_OF_SESSION)) { + if (a.equals(b)) { + return 0; + } else if (a.equals(Constant.END_OF_SESSION)) { + return 1; + } else { + return -1; + } + } + return a.compareTo(b); + }; + + private boolean isCommitting; + + public SessionCommitCoordinateHelper(int parallelism) { + Preconditions.checkArgument(parallelism > 0); + isCommitting = true; + toCommitFutures = new HashMap<>(); + toCommitSessionIds = new ArrayDeque[parallelism]; + + for (int i = 0; i < parallelism; i++) { + toCommitSessionIds[i] = new ArrayDeque<>(); + } + } + + public void clear() { + for (Queue toCommitSessionId : toCommitSessionIds) { + toCommitSessionId.clear(); + } + toCommitFutures.clear(); + isCommitting = true; + } + + public CompletableFuture commit(int subtaskId, String sessionId) { + LOG.info("subtask {} commit sessionId: {}", subtaskId, sessionId); + toCommitSessionIds[subtaskId].offer(sessionId); + if (toCommitFutures.containsKey(sessionId)) { + return toCommitFutures.get(sessionId); + } + CompletableFuture future = new CompletableFuture<>(); + toCommitFutures.putIfAbsent(sessionId, future); + return future; + } + + public String getToCommitSessionId() { + String peekSession = null; + for (Queue commitSessionId : toCommitSessionIds) { + if (commitSessionId.isEmpty()) { + return null; + } + if (peekSession == null) { + peekSession = commitSessionId.peek(); + } else { + if (comparator.compare(commitSessionId.peek(), peekSession) < 0) { + peekSession = commitSessionId.peek(); + } + } + } + // peekSession cannot be null here. 
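 + // Every queue was verified to be non-empty above, so each peek() produced a value and
 + // peekSession now holds the smallest pending session id. If that smallest id is the
 + // END_OF_SESSION marker, every executor has already drained its real sessions, so there
 + // is nothing left to commit.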
+ if (peekSession.equals(Constant.END_OF_SESSION)) { + isCommitting = false; + return null; + } + for (Queue toCommitSessionId : toCommitSessionIds) { + if (toCommitSessionId.peek().equals(peekSession)) { + toCommitSessionId.poll(); + } + } + return peekSession; + } + + public void commitSuccess(String sessionId, boolean success) { + toCommitFutures + .get(sessionId) + .complete(CoordinationResponseUtils.wrap(new CommitSessionResponse(success))); + } + + public boolean isCommitting() { + return isCommitting; + } +} diff --git a/flink-cdc-connect/flink-cdc-pipeline-connectors/flink-cdc-pipeline-connector-maxcompute/src/main/java/org/apache/flink/cdc/connectors/maxcompute/utils/TypeConvertUtils.java b/flink-cdc-connect/flink-cdc-pipeline-connectors/flink-cdc-pipeline-connector-maxcompute/src/main/java/org/apache/flink/cdc/connectors/maxcompute/utils/TypeConvertUtils.java new file mode 100644 index 0000000000..e87ad27e7c --- /dev/null +++ b/flink-cdc-connect/flink-cdc-pipeline-connectors/flink-cdc-pipeline-connector-maxcompute/src/main/java/org/apache/flink/cdc/connectors/maxcompute/utils/TypeConvertUtils.java @@ -0,0 +1,537 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.flink.cdc.connectors.maxcompute.utils; + +import org.apache.flink.cdc.common.data.ArrayData; +import org.apache.flink.cdc.common.data.MapData; +import org.apache.flink.cdc.common.data.RecordData; +import org.apache.flink.cdc.common.schema.Schema; +import org.apache.flink.cdc.common.types.ArrayType; +import org.apache.flink.cdc.common.types.DataType; +import org.apache.flink.cdc.common.types.DecimalType; +import org.apache.flink.cdc.common.types.MapType; +import org.apache.flink.cdc.common.types.RowType; +import org.apache.flink.cdc.common.utils.SchemaUtils; + +import com.aliyun.odps.Column; +import com.aliyun.odps.OdpsType; +import com.aliyun.odps.TableSchema; +import com.aliyun.odps.data.ArrayRecord; +import com.aliyun.odps.data.Binary; +import com.aliyun.odps.data.SimpleStruct; +import com.aliyun.odps.data.Struct; +import com.aliyun.odps.table.utils.Preconditions; +import com.aliyun.odps.type.StructTypeInfo; +import com.aliyun.odps.type.TypeInfo; +import com.aliyun.odps.type.TypeInfoFactory; + +import java.math.BigDecimal; +import java.time.Instant; +import java.time.LocalDate; +import java.time.LocalDateTime; +import java.time.LocalTime; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.HashMap; +import java.util.HashSet; +import java.util.List; +import java.util.Map; +import java.util.Set; +import java.util.stream.Collectors; + +import static org.apache.flink.cdc.common.types.DataTypeChecks.getFieldCount; +import static org.apache.flink.cdc.common.types.DataTypeChecks.getPrecision; +import static org.apache.flink.cdc.common.types.DataTypeChecks.getScale; + +/** + * Data type mapping table This table shows the mapping relationship from Flink types to MaxCompute + * types and the corresponding Java type representation. + * + *

+ * | Flink Type                        | MaxCompute Type| Flink Java Type     | MaxCompute Java Type |
+ * |-----------------------------------|----------------|---------------------|----------------------|
+ * | CHAR/VARCHAR/STRING               | STRING         | StringData          | String               |
+ * | BOOLEAN                           | BOOLEAN        | Boolean             | Boolean              |
+ * | BINARY/VARBINARY                  | BINARY         | byte[]              | odps.data.Binary     |
+ * | DECIMAL                           | DECIMAL        | DecimalData         | BigDecimal           |
+ * | TINYINT                           | TINYINT        | Byte                | Byte                 |
+ * | SMALLINT                          | SMALLINT       | Short               | Short                |
+ * | INTEGER                           | INTEGER        | Integer             | Integer              |
+ * | BIGINT                            | BIGINT         | Long                | Long                 |
+ * | FLOAT                             | FLOAT          | Float               | Float                |
+ * | DOUBLE                            | DOUBLE         | Double              | Double               |
+ * | TIME_WITHOUT_TIME_ZONE            | STRING         | Integer             | String               |
+ * | DATE                              | DATE           | Integer             | LocalDate            |
+ * | TIMESTAMP_WITHOUT_TIME_ZONE       | TIMESTAMP_NTZ  | TimestampData       | LocalDateTime        |
+ * | TIMESTAMP_WITH_LOCAL_TIME_ZONE    | TIMESTAMP      | LocalZonedTimestampData | Instant          |
+ * | TIMESTAMP_WITH_TIME_ZONE          | TIMESTAMP      | ZonedTimestampData  | Instant              |
+ * | ARRAY                             | ARRAY          | ArrayData           | ArrayList            |
+ * | MAP                               | MAP            | MapData             | HashMap              |
+ * | ROW                               | STRUCT         | RowData             | odps.data.SimpleStruct|
+ * 
+ */ +public class TypeConvertUtils { + + /** this method ignore the message of primary key in flinkSchema. */ + public static TableSchema toMaxCompute(Schema flinkSchema) { + Preconditions.checkNotNull(flinkSchema, "flink Schema"); + TableSchema tableSchema = new TableSchema(); + Set partitionKeys = new HashSet<>(flinkSchema.partitionKeys()); + List columns = flinkSchema.getColumns(); + Set pkSet = new HashSet<>(flinkSchema.primaryKeys()); + + for (int i = 0; i < flinkSchema.getColumnCount(); i++) { + org.apache.flink.cdc.common.schema.Column flinkColumn = columns.get(i); + Column odpsColumn = toMaxCompute(flinkColumn, pkSet.contains(flinkColumn.getName())); + if (partitionKeys.contains(flinkColumn.getName())) { + tableSchema.addPartitionColumn(odpsColumn); + } else { + tableSchema.addColumn(odpsColumn); + } + } + return tableSchema; + } + + public static Column toMaxCompute( + org.apache.flink.cdc.common.schema.Column flinkColumn, boolean notNull) { + Preconditions.checkNotNull(flinkColumn, "flink Schema Column"); + DataType type = flinkColumn.getType(); + Column.ColumnBuilder columnBuilder = + Column.newBuilder(flinkColumn.getName(), toMaxCompute(type)); + if (!type.isNullable() || notNull) { + columnBuilder.notNull(); + } + return columnBuilder.build(); + } + + public static TypeInfo toMaxCompute(DataType type) { + switch (type.getTypeRoot()) { + case CHAR: + case VARCHAR: + case TIME_WITHOUT_TIME_ZONE: + return TypeInfoFactory.STRING; + case BOOLEAN: + return TypeInfoFactory.BOOLEAN; + case BINARY: + case VARBINARY: + return TypeInfoFactory.BINARY; + case DECIMAL: + DecimalType decimalType = (DecimalType) type; + return TypeInfoFactory.getDecimalTypeInfo( + decimalType.getPrecision(), decimalType.getScale()); + case TINYINT: + return TypeInfoFactory.TINYINT; + case SMALLINT: + return TypeInfoFactory.SMALLINT; + case INTEGER: + return TypeInfoFactory.INT; + case BIGINT: + return TypeInfoFactory.BIGINT; + case FLOAT: + return TypeInfoFactory.FLOAT; + case DOUBLE: + return TypeInfoFactory.DOUBLE; + case DATE: + return TypeInfoFactory.DATE; + case TIMESTAMP_WITHOUT_TIME_ZONE: + return TypeInfoFactory.TIMESTAMP_NTZ; + case TIMESTAMP_WITH_TIME_ZONE: + case TIMESTAMP_WITH_LOCAL_TIME_ZONE: + return TypeInfoFactory.TIMESTAMP; + case ARRAY: + ArrayType arrayType = (ArrayType) type; + return TypeInfoFactory.getArrayTypeInfo(toMaxCompute(arrayType.getElementType())); + case MAP: + MapType mapType = (MapType) type; + return TypeInfoFactory.getMapTypeInfo( + toMaxCompute(mapType.getKeyType()), toMaxCompute(mapType.getValueType())); + case ROW: + RowType rowType = (RowType) type; + return TypeInfoFactory.getStructTypeInfo( + rowType.getFieldNames(), + rowType.getFieldTypes().stream() + .map(TypeConvertUtils::toMaxCompute) + .collect(Collectors.toList())); + default: + throw new UnsupportedOperationException("Unsupported type: " + type); + } + } + + public static void toMaxComputeRecord(Schema flinkSchema, RecordData from, ArrayRecord to) { + Preconditions.checkNotNull(from, "flink record data"); + Preconditions.checkNotNull(to, "maxcompute arrayRecord"); + int partitionKeyCount = flinkSchema.partitionKeys().size(); + + List fieldGetters = createFieldGetters(flinkSchema); + + if (to.getColumnCount() != (fieldGetters.size() - partitionKeyCount)) { + throw new IllegalArgumentException( + "record data count not match, odps {" + + Arrays.stream(to.getColumns()) + .map(c -> c.getName() + " " + c.getTypeInfo().getTypeName()) + .collect(Collectors.joining(", ")) + + "} count " + + to.getColumnCount() + + 
"vs flink {" + + flinkSchema + + "} count " + + (fieldGetters.size() - partitionKeyCount)); + } + for (int i = 0; i < (fieldGetters.size() - partitionKeyCount); i++) { + Object value = fieldGetters.get(i).getFieldOrNull(from); + to.set(i, value); + } + } + + /** + * create a list of {@link RecordData.FieldGetter} from given {@link Schema} to get Object from + * RecordData. + * + *

This method is a modified version of {@link SchemaUtils#createFieldGetters(Schema)}, which + * return MaxCompute Java type + */ + public static List createFieldGetters(Schema schema) { + List fieldGetters = new ArrayList<>(schema.getColumns().size()); + for (int i = 0; i < schema.getColumns().size(); i++) { + fieldGetters.add(createFieldGetter(schema.getColumns().get(i).getType(), i)); + } + return fieldGetters; + } + + /** + * Creates an accessor for getting elements in an internal RecordData structure at the given + * position. + * + *

This method is a modified version of {@link RecordData#createFieldGetter(DataType, int)}, + * which return MaxCompute Java type + * + * @param fieldType the element type of the RecordData + * @param fieldPos the element position of the RecordData + */ + public static RecordData.FieldGetter createFieldGetter(DataType fieldType, int fieldPos) { + final RecordData.FieldGetter fieldGetter; + // ordered by type root definition + switch (fieldType.getTypeRoot()) { + case CHAR: + case VARCHAR: + fieldGetter = record -> record.getString(fieldPos).toString(); + break; + case BOOLEAN: + fieldGetter = record -> record.getBoolean(fieldPos); + break; + case BINARY: + case VARBINARY: + fieldGetter = record -> new Binary(record.getBinary(fieldPos)); + break; + case DECIMAL: + final int decimalPrecision = getPrecision(fieldType); + final int decimalScale = getScale(fieldType); + fieldGetter = + record -> + record.getDecimal(fieldPos, decimalPrecision, decimalScale) + .toBigDecimal(); + break; + case TINYINT: + fieldGetter = record -> record.getByte(fieldPos); + break; + case SMALLINT: + fieldGetter = record -> record.getShort(fieldPos); + break; + case INTEGER: + fieldGetter = record -> record.getInt(fieldPos); + break; + case DATE: + fieldGetter = record -> LocalDate.ofEpochDay(record.getInt(fieldPos)); + break; + case TIME_WITHOUT_TIME_ZONE: + fieldGetter = + record -> LocalTime.ofNanoOfDay(record.getInt(fieldPos) * 1000L).toString(); + break; + case BIGINT: + fieldGetter = record -> record.getLong(fieldPos); + break; + case FLOAT: + fieldGetter = record -> record.getFloat(fieldPos); + break; + case DOUBLE: + fieldGetter = record -> record.getDouble(fieldPos); + break; + case TIMESTAMP_WITHOUT_TIME_ZONE: + fieldGetter = + record -> + record.getTimestamp(fieldPos, getPrecision(fieldType)) + .toLocalDateTime(); + break; + case TIMESTAMP_WITH_LOCAL_TIME_ZONE: + fieldGetter = + record -> + record.getLocalZonedTimestampData(fieldPos, getPrecision(fieldType)) + .toInstant(); + break; + case TIMESTAMP_WITH_TIME_ZONE: + fieldGetter = + record -> + record.getZonedTimestamp(fieldPos, getPrecision(fieldType)) + .toInstant(); + break; + case ARRAY: + fieldGetter = + record -> { + ArrayData array = record.getArray(fieldPos); + DataType elementType = ((ArrayType) fieldType).getElementType(); + ArrayData.ElementGetter elementGetter = + createElementGetter(elementType); + List listData = new ArrayList<>(); + for (int j = 0; j < array.size(); j++) { + listData.add(elementGetter.getElementOrNull(array, j)); + } + return listData; + }; + break; + case MAP: + fieldGetter = + record -> { + MapData map = record.getMap(fieldPos); + ArrayData keyArrayData = map.keyArray(); + ArrayData valueArrayData = map.valueArray(); + DataType keyType = ((MapType) fieldType).getKeyType(); + DataType valueType = ((MapType) fieldType).getValueType(); + ArrayData.ElementGetter keyElementGetter = createElementGetter(keyType); + ArrayData.ElementGetter valueElementGetter = + createElementGetter(valueType); + Map mapData = new HashMap<>(); + for (int j = 0; j < map.size(); j++) { + mapData.put( + keyElementGetter.getElementOrNull(keyArrayData, j), + valueElementGetter.getElementOrNull(valueArrayData, j)); + } + return mapData; + }; + break; + case ROW: + final int rowFieldCount = getFieldCount(fieldType); + fieldGetter = + row -> { + RecordData recordData = row.getRow(fieldPos, rowFieldCount); + RowType rowType = (RowType) fieldType; + StructTypeInfo structTypeInfo = (StructTypeInfo) toMaxCompute(rowType); + List fieldTypes = 
rowType.getFieldTypes(); + List values = new ArrayList<>(); + for (int j = 0; j < fieldTypes.size(); j++) { + values.add( + createFieldGetter(fieldTypes.get(j), j) + .getFieldOrNull(recordData)); + } + return new SimpleStruct(structTypeInfo, values); + }; + break; + default: + throw new IllegalArgumentException(); + } + if (!fieldType.isNullable()) { + return fieldGetter; + } + return row -> { + if (row.isNullAt(fieldPos)) { + return null; + } + return fieldGetter.getFieldOrNull(row); + }; + } + + /** + * Creates an accessor for getting elements in an internal array data structure at the given + * position. + * + *

This method is a modified version of {@link ArrayData#createElementGetter(DataType)}, + * which return MaxCompute Java type + * + * @param elementType the element type of the array + */ + private static ArrayData.ElementGetter createElementGetter(DataType elementType) { + final ArrayData.ElementGetter elementGetter; + // ordered by type root definition + switch (elementType.getTypeRoot()) { + case CHAR: + case VARCHAR: + elementGetter = (array, pos) -> array.getString(pos).toString(); + break; + case BOOLEAN: + elementGetter = ArrayData::getBoolean; + break; + case BINARY: + case VARBINARY: + elementGetter = (array, pos) -> new Binary(array.getBinary(pos)); + break; + case DECIMAL: + final int decimalPrecision = getPrecision(elementType); + final int decimalScale = getScale(elementType); + elementGetter = + (array, pos) -> + array.getDecimal(pos, decimalPrecision, decimalScale) + .toBigDecimal(); + break; + case TINYINT: + elementGetter = ArrayData::getByte; + break; + case SMALLINT: + elementGetter = ArrayData::getShort; + break; + case INTEGER: + elementGetter = ArrayData::getInt; + break; + case DATE: + elementGetter = (array, pos) -> LocalDate.ofEpochDay(array.getInt(pos)); + break; + case TIME_WITHOUT_TIME_ZONE: + elementGetter = + (array, pos) -> LocalTime.ofNanoOfDay(array.getInt(pos) * 1000L).toString(); + break; + case BIGINT: + elementGetter = ArrayData::getLong; + break; + case FLOAT: + elementGetter = ArrayData::getFloat; + break; + case DOUBLE: + elementGetter = ArrayData::getDouble; + break; + case TIMESTAMP_WITHOUT_TIME_ZONE: + final int timestampPrecision = getPrecision(elementType); + elementGetter = + (array, pos) -> + array.getTimestamp(pos, timestampPrecision).toLocalDateTime(); + break; + case TIMESTAMP_WITH_LOCAL_TIME_ZONE: + final int timestampLtzPrecision = getPrecision(elementType); + elementGetter = + (array, pos) -> + array.getLocalZonedTimestamp(pos, timestampLtzPrecision) + .toInstant(); + break; + case TIMESTAMP_WITH_TIME_ZONE: + final int timestampTzPrecision = getPrecision(elementType); + elementGetter = + (array, pos) -> + array.getZonedTimestamp(pos, timestampTzPrecision).toInstant(); + break; + case ARRAY: + elementGetter = + (record, fieldPos) -> { + ArrayData array = record.getArray(fieldPos); + DataType elementTypeInternal = + ((ArrayType) elementType).getElementType(); + ArrayData.ElementGetter elementGetterInternal = + createElementGetter(elementTypeInternal); + List listData = new ArrayList<>(); + for (int j = 0; j < array.size(); j++) { + listData.add(elementGetterInternal.getElementOrNull(array, j)); + } + return listData; + }; + break; + case MAP: + elementGetter = + (record, fieldPos) -> { + MapData map = record.getMap(fieldPos); + ArrayData keyArrayData = map.keyArray(); + ArrayData valueArrayData = map.valueArray(); + DataType keyType = ((MapType) elementType).getKeyType(); + DataType valueType = ((MapType) elementType).getValueType(); + ArrayData.ElementGetter keyElementGetter = createElementGetter(keyType); + ArrayData.ElementGetter valueElementGetter = + createElementGetter(valueType); + Map mapData = new HashMap<>(); + for (int j = 0; j < map.size(); j++) { + mapData.put( + keyElementGetter.getElementOrNull(keyArrayData, j), + valueElementGetter.getElementOrNull(valueArrayData, j)); + } + return mapData; + }; + break; + case ROW: + final int rowFieldCount = getFieldCount(elementType); + elementGetter = + (array, pos) -> { + RecordData recordData = array.getRecord(pos, rowFieldCount); + RowType rowType = (RowType) elementType; + 
StructTypeInfo structTypeInfo = (StructTypeInfo) toMaxCompute(rowType); + List fieldTypes = rowType.getFieldTypes(); + List values = new ArrayList<>(); + for (int j = 0; j < fieldTypes.size(); j++) { + values.add( + createFieldGetter(fieldTypes.get(j), j) + .getFieldOrNull(recordData)); + } + return new SimpleStruct(structTypeInfo, values); + }; + break; + default: + throw new IllegalArgumentException(); + } + if (!elementType.isNullable()) { + return elementGetter; + } + return (array, pos) -> { + if (array.isNullAt(pos)) { + return null; + } + return elementGetter.getElementOrNull(array, pos); + }; + } + + public static OdpsType inferMaxComputeType(Object object) { + if (object instanceof String) { + return OdpsType.STRING; + } else if (object instanceof Boolean) { + return OdpsType.BOOLEAN; + } else if (object instanceof Binary) { + return OdpsType.BINARY; + } else if (object instanceof BigDecimal) { + return OdpsType.DECIMAL; + } else if (object instanceof Byte) { + return OdpsType.TINYINT; + } else if (object instanceof Short) { + return OdpsType.SMALLINT; + } else if (object instanceof Integer) { + return OdpsType.INT; + } else if (object instanceof Long) { + return OdpsType.BIGINT; + } else if (object instanceof Float) { + return OdpsType.FLOAT; + } else if (object instanceof Double) { + return OdpsType.DOUBLE; + } else if (object instanceof LocalDate) { + return OdpsType.DATE; + } else if (object instanceof LocalDateTime) { + return OdpsType.TIMESTAMP_NTZ; + } else if (object instanceof Instant) { + return OdpsType.TIMESTAMP; + } else if (object instanceof List) { + return OdpsType.ARRAY; + } else if (object instanceof Map) { + return OdpsType.MAP; + } else if (object instanceof Struct) { + return OdpsType.STRUCT; + } else { + throw new IllegalArgumentException("Unsupported type: " + object.getClass()); + } + } +} diff --git a/flink-cdc-connect/flink-cdc-pipeline-connectors/flink-cdc-pipeline-connector-maxcompute/src/main/java/org/apache/flink/cdc/connectors/maxcompute/writer/BatchAppendWriter.java b/flink-cdc-connect/flink-cdc-pipeline-connectors/flink-cdc-pipeline-connector-maxcompute/src/main/java/org/apache/flink/cdc/connectors/maxcompute/writer/BatchAppendWriter.java new file mode 100644 index 0000000000..1cf6be4c0a --- /dev/null +++ b/flink-cdc-connect/flink-cdc-pipeline-connectors/flink-cdc-pipeline-connector-maxcompute/src/main/java/org/apache/flink/cdc/connectors/maxcompute/writer/BatchAppendWriter.java @@ -0,0 +1,139 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.flink.cdc.connectors.maxcompute.writer; + +import org.apache.flink.cdc.common.utils.StringUtils; +import org.apache.flink.cdc.connectors.maxcompute.common.SessionIdentifier; +import org.apache.flink.cdc.connectors.maxcompute.common.UncheckedOdpsException; +import org.apache.flink.cdc.connectors.maxcompute.options.MaxComputeOptions; +import org.apache.flink.cdc.connectors.maxcompute.options.MaxComputeWriteOptions; +import org.apache.flink.cdc.connectors.maxcompute.utils.MaxComputeUtils; + +import com.aliyun.odps.OdpsException; +import com.aliyun.odps.PartitionSpec; +import com.aliyun.odps.data.ArrayRecord; +import com.aliyun.odps.data.RecordWriter; +import com.aliyun.odps.tunnel.TableTunnel; +import com.aliyun.odps.tunnel.TunnelException; +import com.aliyun.odps.tunnel.impl.UpsertSessionImpl; +import com.aliyun.odps.tunnel.io.TunnelBufferedWriter; +import com.aliyun.odps.tunnel.streams.UpsertStream; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.IOException; + +/** + * MaxCompute upsert writer, use {@link UpsertSessionImpl} and {@link UpsertStream} to write data. + * Each session corresponds to a stream. + */ +public class BatchAppendWriter implements MaxComputeWriter { + + private static final Logger LOG = LoggerFactory.getLogger(BatchAppendWriter.class); + + private final MaxComputeOptions options; + private final MaxComputeWriteOptions writeOptions; + private final SessionIdentifier sessionIdentifier; + private final TableTunnel tunnel; + private TableTunnel.UploadSession uploadSession; + private RecordWriter recordWriter; + + public BatchAppendWriter( + MaxComputeOptions options, + MaxComputeWriteOptions writeOptions, + SessionIdentifier sessionIdentifier) { + this.options = options; + this.writeOptions = writeOptions; + + this.tunnel = MaxComputeUtils.getTunnel(options, writeOptions); + this.sessionIdentifier = sessionIdentifier; + + LOG.info("sink writer reload session: {}", sessionIdentifier); + initOrReloadSession(sessionIdentifier); + } + + private void initOrReloadSession(SessionIdentifier identifier) { + String partitionSpec = identifier.getPartitionName(); + String sessionId = identifier.getSessionId(); + + try { + if (StringUtils.isNullOrWhitespaceOnly(identifier.getSessionId())) { + this.uploadSession = + tunnel.createUploadSession( + identifier.getProject(), + identifier.getSchema(), + identifier.getTable(), + new PartitionSpec(partitionSpec), + false); + } else { + this.uploadSession = + tunnel.getUploadSession( + identifier.getProject(), + identifier.getSchema(), + identifier.getTable(), + new PartitionSpec(partitionSpec), + sessionId); + } + this.recordWriter = + uploadSession.openBufferedWriter( + MaxComputeUtils.compressOptionOf(writeOptions.getCompressAlgorithm())); + } catch (OdpsException e) { + throw new UncheckedOdpsException(e); + } + } + + @Override + public SessionIdentifier getSessionIdentifier() { + return sessionIdentifier; + } + + @Override + public ArrayRecord newElement() { + return (ArrayRecord) uploadSession.newRecord(); + } + + @Override + public void write(ArrayRecord record) throws IOException { + recordWriter.write(record); + } + + @Override + public void delete(ArrayRecord record) throws IOException { + // append writer does not support delete. just ignore delete operation. 
+ } + + @Override + public void flush() throws IOException { + ((TunnelBufferedWriter) recordWriter).flush(); + } + + @Override + public String getId() { + return uploadSession.getId(); + } + + @Override + public void commit() throws IOException { + try { + uploadSession.commit(); + } catch (TunnelException e) { + throw new IOException(e); + } + } +} diff --git a/flink-cdc-connect/flink-cdc-pipeline-connectors/flink-cdc-pipeline-connector-maxcompute/src/main/java/org/apache/flink/cdc/connectors/maxcompute/writer/BatchUpsertWriter.java b/flink-cdc-connect/flink-cdc-pipeline-connectors/flink-cdc-pipeline-connector-maxcompute/src/main/java/org/apache/flink/cdc/connectors/maxcompute/writer/BatchUpsertWriter.java new file mode 100644 index 0000000000..731e6139c9 --- /dev/null +++ b/flink-cdc-connect/flink-cdc-pipeline-connectors/flink-cdc-pipeline-connector-maxcompute/src/main/java/org/apache/flink/cdc/connectors/maxcompute/writer/BatchUpsertWriter.java @@ -0,0 +1,176 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.flink.cdc.connectors.maxcompute.writer; + +import org.apache.flink.cdc.connectors.maxcompute.common.SessionIdentifier; +import org.apache.flink.cdc.connectors.maxcompute.common.UncheckedOdpsException; +import org.apache.flink.cdc.connectors.maxcompute.options.MaxComputeOptions; +import org.apache.flink.cdc.connectors.maxcompute.options.MaxComputeWriteOptions; +import org.apache.flink.cdc.connectors.maxcompute.utils.MaxComputeUtils; + +import com.aliyun.odps.OdpsException; +import com.aliyun.odps.data.ArrayRecord; +import com.aliyun.odps.tunnel.TableTunnel; +import com.aliyun.odps.tunnel.TunnelException; +import com.aliyun.odps.tunnel.impl.UpsertSessionImpl; +import com.aliyun.odps.tunnel.impl.UpsertSessionImpl.Builder; +import com.aliyun.odps.tunnel.streams.UpsertStream; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.IOException; + +/** + * MaxCompute upsert writer, use {@link UpsertSessionImpl} and {@link UpsertStream} to write data. + * Each session corresponds to a stream. 
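 + * Records are buffered inside the {@link UpsertStream} according to the configured slot and max
 + * buffer sizes; {@link #flush()} pushes the buffered data to the service, and {@link #commit()}
 + * commits the upsert session and closes it.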
+ */ +public class BatchUpsertWriter implements MaxComputeWriter { + + private static final Logger LOG = LoggerFactory.getLogger(BatchUpsertWriter.class); + + private final MaxComputeOptions options; + private final MaxComputeWriteOptions writeOptions; + private final SessionIdentifier sessionIdentifier; + private final TableTunnel tunnel; + private UpsertSessionImpl upsertSession; + private UpsertStream upsertStream; + + public BatchUpsertWriter( + MaxComputeOptions options, + MaxComputeWriteOptions writeOptions, + SessionIdentifier sessionIdentifier) + throws IOException { + this.options = options; + this.writeOptions = writeOptions; + + this.tunnel = MaxComputeUtils.getTunnel(options, writeOptions); + this.sessionIdentifier = sessionIdentifier; + + initOrReloadSession(sessionIdentifier); + } + + private void initOrReloadSession(SessionIdentifier identifier) throws IOException { + String partitionSpec = identifier.getPartitionName(); + String sessionId = identifier.getSessionId(); + try { + this.upsertSession = + ((Builder) + tunnel.buildUpsertSession( + identifier.getProject(), identifier.getTable())) + .setConfig(tunnel.getConfig()) + .setSchemaName(identifier.getSchema()) + .setPartitionSpec(partitionSpec) + .setUpsertId(sessionId) + .setConcurrentNum(writeOptions.getFlushConcurrent()) + .build(); + this.upsertStream = + upsertSession + .buildUpsertStream() + .setListener(new UpsertStreamListener(upsertSession)) + .setMaxBufferSize(writeOptions.getMaxBufferSize()) + .setSlotBufferSize(writeOptions.getSlotBufferSize()) + .setCompressOption( + MaxComputeUtils.compressOptionOf( + writeOptions.getCompressAlgorithm())) + .build(); + } catch (OdpsException e) { + throw new UncheckedOdpsException(e); + } + } + + @Override + public SessionIdentifier getSessionIdentifier() { + return sessionIdentifier; + } + + @Override + public ArrayRecord newElement() { + return (ArrayRecord) upsertSession.newRecord(); + } + + @Override + public void write(ArrayRecord record) throws IOException { + try { + upsertStream.upsert(record); + } catch (OdpsException e) { + throw new IOException(e.getMessage() + "RequestId: " + e.getRequestId(), e); + } + } + + @Override + public void delete(ArrayRecord record) throws IOException { + try { + upsertStream.delete(record); + } catch (OdpsException e) { + throw new IOException(e.getMessage() + "RequestId: " + e.getRequestId(), e); + } + } + + @Override + public void flush() throws IOException { + try { + upsertStream.flush(); + } catch (OdpsException e) { + throw new IOException(e.getMessage() + "RequestId: " + e.getRequestId(), e); + } + } + + @Override + public String getId() { + return upsertSession.getId(); + } + + @Override + public void commit() throws IOException { + try { + upsertSession.commit(false); + upsertSession.close(); + } catch (OdpsException e) { + throw new IOException(e.getMessage() + "RequestId: " + e.getRequestId(), e); + } + } + + static class UpsertStreamListener extends UpsertSessionImpl.DefaultUpsertSteamListener { + + public UpsertStreamListener(UpsertSessionImpl session) { + super(session); + } + + @Override + public void onFlush(UpsertStream.FlushResult result) { + // metrics here + LOG.info( + "Flush success, trace id: {}, time: {}, record: {}", + result.traceId, + result.flushTime, + result.recordCount); + } + + @Override + public boolean onFlushFail(Exception error, int retry) { + LOG.error( + "Flush failed error {}, requestId {}", + error.getMessage(), + error instanceof TunnelException + ? 
((TunnelException) error).getRequestId() + : ""); + return super.onFlushFail(error, retry); + } + } +} diff --git a/flink-cdc-connect/flink-cdc-pipeline-connectors/flink-cdc-pipeline-connector-maxcompute/src/main/java/org/apache/flink/cdc/connectors/maxcompute/writer/MaxComputeWriter.java b/flink-cdc-connect/flink-cdc-pipeline-connectors/flink-cdc-pipeline-connector-maxcompute/src/main/java/org/apache/flink/cdc/connectors/maxcompute/writer/MaxComputeWriter.java new file mode 100644 index 0000000000..dcac8500b2 --- /dev/null +++ b/flink-cdc-connect/flink-cdc-pipeline-connectors/flink-cdc-pipeline-connector-maxcompute/src/main/java/org/apache/flink/cdc/connectors/maxcompute/writer/MaxComputeWriter.java @@ -0,0 +1,58 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.flink.cdc.connectors.maxcompute.writer; + +import org.apache.flink.cdc.connectors.maxcompute.common.SessionIdentifier; +import org.apache.flink.cdc.connectors.maxcompute.options.MaxComputeOptions; +import org.apache.flink.cdc.connectors.maxcompute.options.MaxComputeWriteOptions; +import org.apache.flink.cdc.connectors.maxcompute.utils.MaxComputeUtils; + +import com.aliyun.odps.data.ArrayRecord; + +import java.io.IOException; + +/** the interface of all writer to write {@link ArrayRecord} to maxcompute. */ +public interface MaxComputeWriter { + + static MaxComputeWriter batchWriter( + MaxComputeOptions options, + MaxComputeWriteOptions writeOptions, + SessionIdentifier sessionIdentifier) + throws IOException { + if (MaxComputeUtils.isTransactionalTable(options, sessionIdentifier)) { + return new BatchUpsertWriter(options, writeOptions, sessionIdentifier); + } else { + return new BatchAppendWriter(options, writeOptions, sessionIdentifier); + } + } + + SessionIdentifier getSessionIdentifier(); + + ArrayRecord newElement(); + + void write(ArrayRecord record) throws IOException; + + void delete(ArrayRecord record) throws IOException; + + void flush() throws IOException; + + void commit() throws IOException; + + String getId(); +} diff --git a/flink-cdc-connect/flink-cdc-pipeline-connectors/flink-cdc-pipeline-connector-maxcompute/src/main/resources/META-INF/services/org.apache.flink.cdc.common.factories.Factory b/flink-cdc-connect/flink-cdc-pipeline-connectors/flink-cdc-pipeline-connector-maxcompute/src/main/resources/META-INF/services/org.apache.flink.cdc.common.factories.Factory new file mode 100644 index 0000000000..947fe6ce93 --- /dev/null +++ b/flink-cdc-connect/flink-cdc-pipeline-connectors/flink-cdc-pipeline-connector-maxcompute/src/main/resources/META-INF/services/org.apache.flink.cdc.common.factories.Factory @@ -0,0 +1,17 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. 
See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +org.apache.flink.cdc.connectors.maxcompute.MaxComputeDataSinkFactory + diff --git a/flink-cdc-connect/flink-cdc-pipeline-connectors/flink-cdc-pipeline-connector-maxcompute/src/test/java/org/apache/flink/cdc/connectors/maxcompute/EmulatorTestBase.java b/flink-cdc-connect/flink-cdc-pipeline-connectors/flink-cdc-pipeline-connector-maxcompute/src/test/java/org/apache/flink/cdc/connectors/maxcompute/EmulatorTestBase.java new file mode 100644 index 0000000000..6d40c3c41b --- /dev/null +++ b/flink-cdc-connect/flink-cdc-pipeline-connectors/flink-cdc-pipeline-connector-maxcompute/src/test/java/org/apache/flink/cdc/connectors/maxcompute/EmulatorTestBase.java @@ -0,0 +1,95 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.flink.cdc.connectors.maxcompute; + +import org.apache.flink.cdc.connectors.maxcompute.options.MaxComputeOptions; +import org.apache.flink.cdc.connectors.maxcompute.utils.MaxComputeUtils; + +import com.aliyun.odps.Odps; +import org.junit.ClassRule; +import org.testcontainers.containers.GenericContainer; +import org.testcontainers.containers.wait.strategy.Wait; +import org.testcontainers.utility.DockerImageName; + +import java.io.OutputStream; +import java.net.HttpURLConnection; +import java.net.InetAddress; +import java.net.URL; +import java.net.UnknownHostException; + +/** init maxcompute-emulator use for e2e test. 
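 + * The emulator container is started while the test options are being built: the externally
 + * reachable endpoint is resolved from the container host and mapped port, then registered with
 + * the emulator via a POST to its {@code /init} endpoint before the Odps client is created.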
*/ +public class EmulatorTestBase { + public static final DockerImageName MAXCOMPUTE_IMAGE = + DockerImageName.parse("maxcompute/maxcompute-emulator:v0.0.4"); + + @ClassRule + public static GenericContainer maxcompute = + new GenericContainer<>(MAXCOMPUTE_IMAGE) + .withExposedPorts(8080) + .waitingFor( + Wait.forLogMessage(".*Started MaxcomputeEmulatorApplication.*\\n", 1)) + .withLogConsumer(frame -> System.out.print(frame.getUtf8String())); + + public final MaxComputeOptions testOptions = + MaxComputeOptions.builder("ak", "sk", getEndpoint(), "mocked_mc").build(); + + public final Odps odps = MaxComputeUtils.getOdps(testOptions); + + private String getEndpoint() { + maxcompute.start(); + + String ip; + if (maxcompute.getHost().equals("localhost")) { + try { + ip = InetAddress.getLocalHost().getHostAddress(); + } catch (UnknownHostException e) { + ip = "127.0.0.1"; + } + } else { + ip = maxcompute.getHost(); + } + String endpoint = "http://" + ip + ":" + maxcompute.getFirstMappedPort(); + sendPOST(endpoint + "/init", endpoint); + return endpoint; + } + + public static void sendPOST(String postUrl, String postData) { + try { + URL url = new URL(postUrl); + + HttpURLConnection httpURLConnection = (HttpURLConnection) url.openConnection(); + httpURLConnection.setRequestMethod("POST"); + httpURLConnection.setDoOutput(true); + httpURLConnection.setRequestProperty("Content-Type", "application/json"); + httpURLConnection.setRequestProperty( + "Content-Length", String.valueOf(postData.length())); + + try (OutputStream outputStream = httpURLConnection.getOutputStream()) { + outputStream.write(postData.getBytes("UTF-8")); + outputStream.flush(); + } + int responseCode = httpURLConnection.getResponseCode(); + if (responseCode != HttpURLConnection.HTTP_OK) { + throw new RuntimeException( + "POST request failed with response code: " + responseCode); + } + } catch (Exception e) { + throw new RuntimeException(e); + } + } +} diff --git a/flink-cdc-connect/flink-cdc-pipeline-connectors/flink-cdc-pipeline-connector-maxcompute/src/test/java/org/apache/flink/cdc/connectors/maxcompute/utils/SchemaEvolutionUtilsTest.java b/flink-cdc-connect/flink-cdc-pipeline-connectors/flink-cdc-pipeline-connector-maxcompute/src/test/java/org/apache/flink/cdc/connectors/maxcompute/utils/SchemaEvolutionUtilsTest.java new file mode 100644 index 0000000000..4c600189b3 --- /dev/null +++ b/flink-cdc-connect/flink-cdc-pipeline-connectors/flink-cdc-pipeline-connector-maxcompute/src/test/java/org/apache/flink/cdc/connectors/maxcompute/utils/SchemaEvolutionUtilsTest.java @@ -0,0 +1,129 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+
+package org.apache.flink.cdc.connectors.maxcompute.utils;
+
+import org.apache.flink.cdc.common.event.AddColumnEvent;
+import org.apache.flink.cdc.common.event.TableId;
+import org.apache.flink.cdc.common.schema.Column;
+import org.apache.flink.cdc.common.schema.Schema;
+import org.apache.flink.cdc.common.types.DataTypes;
+import org.apache.flink.cdc.connectors.maxcompute.EmulatorTestBase;
+
+import org.apache.flink.shaded.guava31.com.google.common.collect.ImmutableList;
+import org.apache.flink.shaded.guava31.com.google.common.collect.ImmutableMap;
+
+import com.aliyun.odps.OdpsException;
+import com.aliyun.odps.TableSchema;
+import org.junit.jupiter.api.AfterEach;
+import org.junit.jupiter.api.Assertions;
+import org.junit.jupiter.api.BeforeEach;
+import org.junit.jupiter.api.Test;
+
+/**
+ * E2e test of {@link SchemaEvolutionUtils}. Note that the emulator only supports uppercase input
+ * (however, MaxCompute itself can correctly distinguish between uppercase and lowercase).
+ *
+ *

Since the emulator does not support alter column type, the test cases here are mainly for + * testing other schema evolution logic. + */ +class SchemaEvolutionUtilsTest extends EmulatorTestBase { + + private static final String TEST_TABLE = "SCHEMA_EVOLUTION_TEST_TABLE"; + + @BeforeEach + void testCreateTable() { + try { + SchemaEvolutionUtils.createTable( + testOptions, + TableId.tableId(TEST_TABLE), + Schema.newBuilder() + .physicalColumn("PK", DataTypes.BIGINT()) + .physicalColumn("ID1", DataTypes.BIGINT()) + .physicalColumn("ID2", DataTypes.BIGINT()) + .primaryKey("PK") + .build()); + Assertions.assertEquals( + ImmutableList.of("PK"), odps.tables().get(TEST_TABLE).getPrimaryKey()); + } catch (Exception e) { + Assertions.fail(e.getMessage()); + } + } + + @AfterEach + void deleteTable() throws OdpsException { + odps.tables().delete(TEST_TABLE, true); + } + + @Test + void testAddColumn() throws OdpsException { + try { + SchemaEvolutionUtils.addColumns( + testOptions, + TableId.tableId(TEST_TABLE), + ImmutableList.of( + new AddColumnEvent.ColumnWithPosition( + Column.physicalColumn("ID3", DataTypes.BIGINT())), + new AddColumnEvent.ColumnWithPosition( + Column.physicalColumn("NAME", DataTypes.STRING())))); + TableSchema schema = odps.tables().get(TEST_TABLE).getSchema(); + + Assertions.assertEquals(5, schema.getColumns().size()); + Assertions.assertEquals("PK", schema.getColumns().get(0).getName()); + Assertions.assertEquals("ID1", schema.getColumns().get(1).getName()); + Assertions.assertEquals("ID2", schema.getColumns().get(2).getName()); + Assertions.assertEquals("ID3", schema.getColumns().get(3).getName()); + Assertions.assertEquals("NAME", schema.getColumns().get(4).getName()); + } catch (Exception e) { + Assertions.fail(e.getMessage()); + } + } + + @Test + void testDropColumn() throws OdpsException { + try { + SchemaEvolutionUtils.dropColumn( + testOptions, TableId.tableId(TEST_TABLE), ImmutableList.of("ID1", "ID2")); + TableSchema schema = odps.tables().get(TEST_TABLE).getSchema(); + + Assertions.assertEquals(1, schema.getColumns().size()); + Assertions.assertEquals("PK", schema.getColumns().get(0).getName()); + } catch (Exception e) { + Assertions.fail(e.getMessage()); + } + } + + @Test + void testRenameColumn() { + try { + TableSchema originSchema = odps.tables().get(TEST_TABLE).getSchema(); + Assertions.assertEquals("ID1", originSchema.getColumns().get(1).getName()); + Assertions.assertEquals("ID2", originSchema.getColumns().get(2).getName()); + + SchemaEvolutionUtils.renameColumn( + testOptions, + TableId.tableId(TEST_TABLE), + ImmutableMap.of("ID1", "ID1_NEW", "ID2", "ID2_NEW")); + + TableSchema expectSchema = odps.tables().get(TEST_TABLE).getSchema(); + Assertions.assertEquals("ID1_NEW", expectSchema.getColumns().get(1).getName()); + Assertions.assertEquals("ID2_NEW", expectSchema.getColumns().get(2).getName()); + } catch (Exception e) { + Assertions.fail(e.getMessage()); + } + } +} diff --git a/flink-cdc-connect/flink-cdc-pipeline-connectors/flink-cdc-pipeline-connector-maxcompute/src/test/java/org/apache/flink/cdc/connectors/maxcompute/utils/SessionCommitCoordinateHelperTest.java b/flink-cdc-connect/flink-cdc-pipeline-connectors/flink-cdc-pipeline-connector-maxcompute/src/test/java/org/apache/flink/cdc/connectors/maxcompute/utils/SessionCommitCoordinateHelperTest.java new file mode 100644 index 0000000000..f1bef59159 --- /dev/null +++ 
b/flink-cdc-connect/flink-cdc-pipeline-connectors/flink-cdc-pipeline-connector-maxcompute/src/test/java/org/apache/flink/cdc/connectors/maxcompute/utils/SessionCommitCoordinateHelperTest.java @@ -0,0 +1,114 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.flink.cdc.connectors.maxcompute.utils; + +import org.apache.flink.cdc.connectors.maxcompute.common.Constant; + +import org.junit.Assert; +import org.junit.Test; + +import java.util.concurrent.ExecutionException; +import java.util.concurrent.ExecutorService; +import java.util.concurrent.Executors; +import java.util.concurrent.Future; + +/** */ +public class SessionCommitCoordinateHelperTest { + + @Test + public void test() throws ExecutionException, InterruptedException { + SessionCommitCoordinateHelper sessionCommitCoordinateHelper = + new SessionCommitCoordinateHelper(4); + sessionCommitCoordinateHelper.clear(); + ExecutorService executorService = Executors.newFixedThreadPool(5); + + Future future = + executorService.submit( + () -> { + int expect = 1; + while (sessionCommitCoordinateHelper.isCommitting()) { + try { + String toCommitSessionId = + sessionCommitCoordinateHelper.getToCommitSessionId(); + if (toCommitSessionId != null) { + Assert.assertEquals( + expect, Integer.parseInt(toCommitSessionId)); + expect++; + } + Thread.sleep(1000); + } catch (InterruptedException e) { + e.printStackTrace(); + } + } + }); + + executorService.submit( + () -> { + try { + Thread.sleep(3000); + sessionCommitCoordinateHelper.commit(0, "1"); + Thread.sleep(5000); + sessionCommitCoordinateHelper.commit(0, Constant.END_OF_SESSION); + } catch (InterruptedException e) { + e.printStackTrace(); + } + }); + executorService.submit( + () -> { + try { + Thread.sleep(2000); + sessionCommitCoordinateHelper.commit(1, "1"); + Thread.sleep(5000); + sessionCommitCoordinateHelper.commit(1, "2"); + Thread.sleep(1000); + sessionCommitCoordinateHelper.commit(1, Constant.END_OF_SESSION); + } catch (InterruptedException e) { + e.printStackTrace(); + } + }); + executorService.submit( + () -> { + try { + Thread.sleep(4000); + sessionCommitCoordinateHelper.commit(2, "2"); + Thread.sleep(3000); + sessionCommitCoordinateHelper.commit(2, "3"); + Thread.sleep(2000); + sessionCommitCoordinateHelper.commit(2, Constant.END_OF_SESSION); + } catch (InterruptedException e) { + e.printStackTrace(); + } + }); + executorService.submit( + () -> { + try { + Thread.sleep(2000); + sessionCommitCoordinateHelper.commit(3, "1"); + Thread.sleep(2000); + sessionCommitCoordinateHelper.commit(3, "2"); + Thread.sleep(2000); + sessionCommitCoordinateHelper.commit(3, "3"); + sessionCommitCoordinateHelper.commit(3, Constant.END_OF_SESSION); + } catch (InterruptedException e) { + e.printStackTrace(); + } + }); + + future.get(); + } 
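+
+    // The committer tasks above report their session ids at staggered times; the polling task
+    // asserts that getToCommitSessionId() hands the ids back strictly in ascending order
+    // ("1", "2", "3") for as long as isCommitting() returns true.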
+} diff --git a/flink-cdc-connect/flink-cdc-pipeline-connectors/flink-cdc-pipeline-connector-maxcompute/src/test/java/org/apache/flink/cdc/connectors/maxcompute/utils/TypeConvertUtilsTest.java b/flink-cdc-connect/flink-cdc-pipeline-connectors/flink-cdc-pipeline-connector-maxcompute/src/test/java/org/apache/flink/cdc/connectors/maxcompute/utils/TypeConvertUtilsTest.java new file mode 100644 index 0000000000..09c1b01112 --- /dev/null +++ b/flink-cdc-connect/flink-cdc-pipeline-connectors/flink-cdc-pipeline-connector-maxcompute/src/test/java/org/apache/flink/cdc/connectors/maxcompute/utils/TypeConvertUtilsTest.java @@ -0,0 +1,185 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.flink.cdc.connectors.maxcompute.utils; + +import org.apache.flink.cdc.common.data.DecimalData; +import org.apache.flink.cdc.common.data.LocalZonedTimestampData; +import org.apache.flink.cdc.common.data.TimestampData; +import org.apache.flink.cdc.common.data.ZonedTimestampData; +import org.apache.flink.cdc.common.data.binary.BinaryRecordData; +import org.apache.flink.cdc.common.data.binary.BinaryStringData; +import org.apache.flink.cdc.common.schema.Schema; +import org.apache.flink.cdc.common.types.DataTypes; +import org.apache.flink.cdc.common.types.RowType; +import org.apache.flink.cdc.runtime.typeutils.BinaryRecordDataGenerator; + +import org.apache.flink.shaded.guava31.com.google.common.collect.ImmutableList; + +import com.aliyun.odps.Column; +import com.aliyun.odps.TableSchema; +import com.aliyun.odps.data.ArrayRecord; +import com.aliyun.odps.type.TypeInfoFactory; +import org.junit.Assert; +import org.junit.Test; + +import java.sql.Timestamp; +import java.time.Instant; +import java.time.ZoneId; +import java.time.ZonedDateTime; +import java.util.List; + +/** test for TypeConvertUtils. 
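+ * Covers both the schema mapping from CDC {@code DataTypes} to MaxCompute {@code TypeInfo}
+ * (including array, map and struct types) and the conversion of a {@code BinaryRecordData} row
+ * into an {@code ArrayRecord} for the non-complex column types.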
*/ +public class TypeConvertUtilsTest { + static Schema allTypeSchema = + Schema.newBuilder() + .physicalColumn("char(5)", DataTypes.CHAR(5)) + .physicalColumn("varchar(10)", DataTypes.VARCHAR(10)) + .physicalColumn("string", DataTypes.STRING()) + .physicalColumn("boolean", DataTypes.BOOLEAN()) + .physicalColumn("binary(5)", DataTypes.BINARY(5)) + .physicalColumn("varbinary(10)", DataTypes.BINARY(10)) + .physicalColumn("decimal(10, 2)", DataTypes.DECIMAL(10, 2)) + .physicalColumn("tinyint", DataTypes.TINYINT()) + .physicalColumn("smallint", DataTypes.SMALLINT()) + .physicalColumn("int", DataTypes.INT()) + .physicalColumn("bigint", DataTypes.BIGINT()) + .physicalColumn("float", DataTypes.FLOAT()) + .physicalColumn("double", DataTypes.DOUBLE()) + .physicalColumn("time", DataTypes.TIME()) + .physicalColumn("date", DataTypes.DATE()) + .physicalColumn("timestamp", DataTypes.TIMESTAMP()) + .physicalColumn("timestamp_ltz", DataTypes.TIMESTAMP_LTZ()) + .physicalColumn("timestamp_tz", DataTypes.TIMESTAMP_TZ()) + .physicalColumn( + "array>", DataTypes.ARRAY(DataTypes.ARRAY(DataTypes.INT()))) + .physicalColumn( + "map, int>", + DataTypes.MAP( + DataTypes.MAP(DataTypes.INT(), DataTypes.INT()), + DataTypes.INT())) + .physicalColumn( + "row, int, int>", + DataTypes.ROW( + DataTypes.FIELD( + "f0", DataTypes.MAP(DataTypes.INT(), DataTypes.INT())), + DataTypes.FIELD("f1", DataTypes.INT()), + DataTypes.FIELD("f2", DataTypes.INT()))) + .build(); + + @Test + public void schemaConvertTest() { + TableSchema maxComputeSchema = TypeConvertUtils.toMaxCompute(allTypeSchema); + + TableSchema expectSchema = new TableSchema(); + expectSchema.addColumn(new Column("char(5)", TypeInfoFactory.STRING)); + expectSchema.addColumn(new Column("varchar(10)", TypeInfoFactory.STRING)); + expectSchema.addColumn(new Column("string", TypeInfoFactory.STRING)); + expectSchema.addColumn(new Column("boolean", TypeInfoFactory.BOOLEAN)); + expectSchema.addColumn(new Column("binary(5)", TypeInfoFactory.BINARY)); + expectSchema.addColumn(new Column("varbinary(10)", TypeInfoFactory.BINARY)); + expectSchema.addColumn( + new Column("decimal(10, 2)", TypeInfoFactory.getDecimalTypeInfo(10, 2))); + expectSchema.addColumn(new Column("tinyint", TypeInfoFactory.TINYINT)); + expectSchema.addColumn(new Column("smallint", TypeInfoFactory.SMALLINT)); + expectSchema.addColumn(new Column("int", TypeInfoFactory.INT)); + expectSchema.addColumn(new Column("bigint", TypeInfoFactory.BIGINT)); + expectSchema.addColumn(new Column("float", TypeInfoFactory.FLOAT)); + expectSchema.addColumn(new Column("double", TypeInfoFactory.DOUBLE)); + expectSchema.addColumn(new Column("time", TypeInfoFactory.STRING)); + expectSchema.addColumn(new Column("date", TypeInfoFactory.DATE)); + expectSchema.addColumn(new Column("timestamp", TypeInfoFactory.TIMESTAMP_NTZ)); + expectSchema.addColumn(new Column("timestamp_ltz", TypeInfoFactory.TIMESTAMP)); + expectSchema.addColumn(new Column("timestamp_tz", TypeInfoFactory.TIMESTAMP)); + expectSchema.addColumn( + new Column( + "array>", + TypeInfoFactory.getArrayTypeInfo( + TypeInfoFactory.getArrayTypeInfo(TypeInfoFactory.INT)))); + expectSchema.addColumn( + new Column( + "map, int>", + TypeInfoFactory.getMapTypeInfo( + TypeInfoFactory.getMapTypeInfo( + TypeInfoFactory.INT, TypeInfoFactory.INT), + TypeInfoFactory.INT))); + expectSchema.addColumn( + new Column( + "row, int, int>", + TypeInfoFactory.getStructTypeInfo( + ImmutableList.of("f0", "f1", "f2"), + ImmutableList.of( + TypeInfoFactory.getMapTypeInfo( + TypeInfoFactory.INT, 
TypeInfoFactory.INT), + TypeInfoFactory.INT, + TypeInfoFactory.INT)))); + + List expect = expectSchema.getAllColumns(); + List current = maxComputeSchema.getAllColumns(); + + for (int i = 0; i < expect.size(); i++) { + Assert.assertEquals( + expect.get(i).getTypeInfo().getTypeName(), + current.get(i).getTypeInfo().getTypeName()); + Assert.assertEquals(expect.get(i).getName(), current.get(i).getName()); + } + } + + @Test + public void testRecordConvert() { + Schema schemaWithoutComplexType = + allTypeSchema.copy( + allTypeSchema.getColumns().stream() + .limit(18) + .collect(ImmutableList.toImmutableList())); + BinaryRecordDataGenerator dataGenerator = + new BinaryRecordDataGenerator((RowType) schemaWithoutComplexType.toRowDataType()); + BinaryRecordData record1 = + dataGenerator.generate( + new Object[] { + BinaryStringData.fromString("char"), + BinaryStringData.fromString("varchar"), + BinaryStringData.fromString("string"), + false, + new byte[] {1, 2, 3, 4, 5}, + new byte[] {1, 2, 3, 4, 5, 6, 7, 8, 9, 10}, + DecimalData.zero(10, 2), + (byte) 1, + (short) 2, + 12345, + 12345L, + 123.456f, + 123456.789d, + 12345, + 12345, + TimestampData.fromTimestamp(Timestamp.from(Instant.ofEpochSecond(0))), + LocalZonedTimestampData.fromInstant(Instant.ofEpochSecond(0)), + ZonedTimestampData.fromZonedDateTime( + ZonedDateTime.ofInstant( + Instant.ofEpochSecond(0), ZoneId.of("Asia/Shanghai"))), + }); + + ArrayRecord arrayRecord = + new ArrayRecord(TypeConvertUtils.toMaxCompute(schemaWithoutComplexType)); + TypeConvertUtils.toMaxComputeRecord(schemaWithoutComplexType, record1, arrayRecord); + + String expect = + "char,varchar,string,false,=01=02=03=04=05,=01=02=03=04=05=06=07=08=09=0A,0.00,1,2,12345,12345,123.456,123456.789,00:00:00.012345,2003-10-20,1970-01-01T08:00,1970-01-01T00:00:00Z,1970-01-01T00:00:00Z"; + Assert.assertEquals(expect, arrayRecord.toString()); + } +} diff --git a/flink-cdc-connect/flink-cdc-pipeline-connectors/flink-cdc-pipeline-connector-maxcompute/src/test/resources/log4j2-test.properties b/flink-cdc-connect/flink-cdc-pipeline-connectors/flink-cdc-pipeline-connector-maxcompute/src/test/resources/log4j2-test.properties new file mode 100644 index 0000000000..f0d32fb590 --- /dev/null +++ b/flink-cdc-connect/flink-cdc-pipeline-connectors/flink-cdc-pipeline-connector-maxcompute/src/test/resources/log4j2-test.properties @@ -0,0 +1,25 @@ +################################################################################ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+################################################################################ + +# Set root logger level to OFF to not flood build logs +# set manually to INFO for debugging purposes +rootLogger.level=INFO +rootLogger.appenderRef.test.ref = TestLogger + +appender.testlogger.name = TestLogger +appender.testlogger.type = CONSOLE +appender.testlogger.target = SYSTEM_ERR +appender.testlogger.layout.type = PatternLayout +appender.testlogger.layout.pattern = %-4r [%t] %-5p %c - %m%n diff --git a/flink-cdc-connect/flink-cdc-pipeline-connectors/pom.xml b/flink-cdc-connect/flink-cdc-pipeline-connectors/pom.xml index 424372f4c1..c3e64d708c 100644 --- a/flink-cdc-connect/flink-cdc-pipeline-connectors/pom.xml +++ b/flink-cdc-connect/flink-cdc-pipeline-connectors/pom.xml @@ -34,6 +34,7 @@ limitations under the License. flink-cdc-pipeline-connector-starrocks flink-cdc-pipeline-connector-kafka flink-cdc-pipeline-connector-paimon + flink-cdc-pipeline-connector-maxcompute diff --git a/flink-cdc-e2e-tests/flink-cdc-pipeline-e2e-tests/pom.xml b/flink-cdc-e2e-tests/flink-cdc-pipeline-e2e-tests/pom.xml index 2326240b6d..2d44cabd39 100644 --- a/flink-cdc-e2e-tests/flink-cdc-pipeline-e2e-tests/pom.xml +++ b/flink-cdc-e2e-tests/flink-cdc-pipeline-e2e-tests/pom.xml @@ -99,6 +99,12 @@ limitations under the License. test-jar test + + org.apache.flink + flink-cdc-pipeline-connector-maxcompute + ${project.version} + test + org.apache.flink flink-connector-test-util @@ -107,6 +113,12 @@ limitations under the License. + + org.testcontainers + testcontainers + ${testcontainers.version} + test + org.testcontainers mysql @@ -231,6 +243,16 @@ limitations under the License. ${project.build.directory}/dependencies + + + org.apache.flink + flink-cdc-pipeline-connector-maxcompute + ${project.version} + maxcompute-cdc-pipeline-connector.jar + jar + ${project.build.directory}/dependencies + + diff --git a/flink-cdc-e2e-tests/flink-cdc-pipeline-e2e-tests/src/test/java/org/apache/flink/cdc/pipeline/tests/MaxComputeE2eITCase.java b/flink-cdc-e2e-tests/flink-cdc-pipeline-e2e-tests/src/test/java/org/apache/flink/cdc/pipeline/tests/MaxComputeE2eITCase.java new file mode 100644 index 0000000000..cea874ef93 --- /dev/null +++ b/flink-cdc-e2e-tests/flink-cdc-pipeline-e2e-tests/src/test/java/org/apache/flink/cdc/pipeline/tests/MaxComputeE2eITCase.java @@ -0,0 +1,193 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.flink.cdc.pipeline.tests; + +import org.apache.flink.api.common.JobStatus; +import org.apache.flink.api.common.time.Deadline; +import org.apache.flink.cdc.common.test.utils.TestUtils; +import org.apache.flink.cdc.connectors.maxcompute.options.MaxComputeOptions; +import org.apache.flink.cdc.connectors.maxcompute.utils.MaxComputeUtils; +import org.apache.flink.cdc.pipeline.tests.utils.PipelineTestEnvironment; +import org.apache.flink.client.program.rest.RestClusterClient; +import org.apache.flink.runtime.client.JobStatusMessage; +import org.apache.flink.table.api.ValidationException; + +import com.aliyun.odps.Instance; +import com.aliyun.odps.Odps; +import com.aliyun.odps.data.Record; +import com.aliyun.odps.task.SQLTask; +import org.junit.Assert; +import org.junit.ClassRule; +import org.junit.Test; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import org.testcontainers.containers.GenericContainer; +import org.testcontainers.containers.wait.strategy.Wait; +import org.testcontainers.utility.DockerImageName; + +import java.io.OutputStream; +import java.net.HttpURLConnection; +import java.net.InetAddress; +import java.net.URL; +import java.net.UnknownHostException; +import java.nio.file.Path; +import java.time.Duration; +import java.util.Collection; +import java.util.List; +import java.util.concurrent.TimeUnit; + +/** End-to-end tests for maxcompute cdc pipeline job. */ +public class MaxComputeE2eITCase extends PipelineTestEnvironment { + private static final Logger LOG = LoggerFactory.getLogger(MaxComputeE2eITCase.class); + + public static final DockerImageName MAXCOMPUTE_IMAGE = + DockerImageName.parse("maxcompute/maxcompute-emulator:v0.0.4"); + + @ClassRule + public static GenericContainer maxcompute = + new GenericContainer<>(MAXCOMPUTE_IMAGE) + .withExposedPorts(8080) + .waitingFor( + Wait.forLogMessage(".*Started MaxcomputeEmulatorApplication.*\\n", 1)) + .withLogConsumer(frame -> System.out.print(frame.getUtf8String())); + + public final MaxComputeOptions testOptions = + MaxComputeOptions.builder("ak", "sk", getEndpoint(), "mocked_mc") + .withTunnelEndpoint(getEndpoint()) + .build(); + + @Test + public void testSingleSplitSingleTable() throws Exception { + startTest("SINGLE_SPLIT_SINGLE_TABLE"); + Instance instance = + SQLTask.run( + MaxComputeUtils.getOdps(testOptions), + "select * from table1 order by col1;"); + instance.waitForSuccess(); + List result = SQLTask.getResult(instance); + System.out.println(result); + Assert.assertEquals(2, result.size()); + // 2,x + Assert.assertEquals("2", result.get(0).get(0)); + Assert.assertEquals("x", result.get(0).get(1)); + // 3, NULL (MaxCompute Emulator use 'NULL' instead of null) + Assert.assertEquals("3", result.get(1).get(0)); + Assert.assertEquals("NULL", result.get(1).get(1)); + } + + private void startTest(String testSet) throws Exception { + sendPOST(getEndpoint() + "/init", getEndpoint()); + + Odps odps = MaxComputeUtils.getOdps(testOptions); + odps.tables().delete("table1", true); + odps.tables().delete("table2", true); + + String pipelineJob = + "source:\n" + + " type: values\n" + + " name: ValuesSource\n" + + " event-set.id: " + + testSet + + "\n" + + "\n" + + "sink:\n" + + " type: maxcompute\n" + + " name: MaxComputeSink\n" + + " accessId: ak\n" + + " accessKey: sk\n" + + " endpoint: " + + getEndpoint() + + "\n" + + " tunnelEndpoint: " + + getEndpoint() + + "\n" + + " project: mocked_mc\n" + + " bucketSize: 8\n" + + " compressAlgorithm: raw\n" + + " maxSessionParallelism: 2\n" + + "\n" + + 
"pipeline:\n" + + " parallelism: 4"; + Path maxcomputeCdcJar = TestUtils.getResource("maxcompute-cdc-pipeline-connector.jar"); + Path valuesCdcJar = TestUtils.getResource("values-cdc-pipeline-connector.jar"); + submitPipelineJob(pipelineJob, maxcomputeCdcJar, valuesCdcJar); + waitUntilJobFinished(Duration.ofMinutes(10)); + } + + private String getEndpoint() { + String ip; + if (maxcompute.getHost().equals("localhost")) { + try { + ip = InetAddress.getLocalHost().getHostAddress(); + } catch (UnknownHostException e) { + ip = "127.0.0.1"; + } + } else { + ip = maxcompute.getHost(); + } + return "http://" + ip + ":" + maxcompute.getFirstMappedPort(); + } + + public static void sendPOST(String postUrl, String postData) throws Exception { + URL url = new URL(postUrl); + + HttpURLConnection httpURLConnection = (HttpURLConnection) url.openConnection(); + httpURLConnection.setRequestMethod("POST"); + httpURLConnection.setDoOutput(true); + httpURLConnection.setRequestProperty("Content-Type", "application/json"); + httpURLConnection.setRequestProperty("Content-Length", String.valueOf(postData.length())); + + try (OutputStream outputStream = httpURLConnection.getOutputStream()) { + outputStream.write(postData.getBytes("UTF-8")); + outputStream.flush(); + } + int responseCode = httpURLConnection.getResponseCode(); + if (responseCode != HttpURLConnection.HTTP_OK) { + throw new RuntimeException("POST request failed with response code: " + responseCode); + } + } + + public void waitUntilJobFinished(Duration timeout) { + RestClusterClient clusterClient = getRestClusterClient(); + Deadline deadline = Deadline.fromNow(timeout); + while (deadline.hasTimeLeft()) { + Collection jobStatusMessages; + try { + jobStatusMessages = clusterClient.listJobs().get(10, TimeUnit.SECONDS); + } catch (Exception e) { + LOG.warn("Error when fetching job status.", e); + continue; + } + if (jobStatusMessages != null && !jobStatusMessages.isEmpty()) { + JobStatusMessage message = jobStatusMessages.iterator().next(); + JobStatus jobStatus = message.getJobState(); + if (jobStatus.isTerminalState()) { + if (jobStatus == JobStatus.FINISHED) { + return; + } + throw new ValidationException( + String.format( + "Job has been terminated unexpectedly! JobName: %s, JobID: %s, Status: %s", + message.getJobName(), + message.getJobId(), + message.getJobState())); + } + } + } + } +}