From 492b8d72b54e9be7b8d35d9947007afa7578d1a9 Mon Sep 17 00:00:00 2001 From: Neil Shen Date: Tue, 17 Jan 2023 20:56:02 +0800 Subject: [PATCH 01/13] ticdc: add scale out for kafka changefeed Signed-off-by: Neil Shen --- ticdc/ticdc-sink-to-kafka.md | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/ticdc/ticdc-sink-to-kafka.md b/ticdc/ticdc-sink-to-kafka.md index 815fe03054f6..74bcbbe6af0d 100644 --- a/ticdc/ticdc-sink-to-kafka.md +++ b/ticdc/ticdc-sink-to-kafka.md @@ -233,3 +233,19 @@ partition 分发器用 partition = "xxx" 来指定,支持 default、ts、index > ``` > {matcher = ['*.*'], dispatcher = "ts", partition = "table"}, > ``` + +### 将大单表的同步负载水平扩张到多个 TiCDC 节点上 + +该功能可以解决单个 TiCDC 节点不能及时同步大单表的问题和 TiCDC 节点之间资源(CPU,内存等)消耗出现倾斜的问题。 +该功能通过将大单表按 Region 个数切分成多个数据范围,将这些数据范围分布到多个 TiCDC 节点上,使得多个 TiCDC 节点可以同时同步大单表。 + +配置样例如下所示: + +```shell +[scheduler] +region-per-span = 50000 +``` + +> **注意:** +> +> v6.6 的 TiCDC 仅支持在 Kafka 同步任务上开启大单表的水平扩展功能。 From 19c3a5656d8ddb86409edf9febd5662eee9314ad Mon Sep 17 00:00:00 2001 From: Neil Shen Date: Tue, 17 Jan 2023 21:04:20 +0800 Subject: [PATCH 02/13] break lines Signed-off-by: Neil Shen --- ticdc/ticdc-sink-to-kafka.md | 1 + 1 file changed, 1 insertion(+) diff --git a/ticdc/ticdc-sink-to-kafka.md b/ticdc/ticdc-sink-to-kafka.md index 74bcbbe6af0d..ba64dacc8a65 100644 --- a/ticdc/ticdc-sink-to-kafka.md +++ b/ticdc/ticdc-sink-to-kafka.md @@ -237,6 +237,7 @@ partition 分发器用 partition = "xxx" 来指定,支持 default、ts、index ### 将大单表的同步负载水平扩张到多个 TiCDC 节点上 该功能可以解决单个 TiCDC 节点不能及时同步大单表的问题和 TiCDC 节点之间资源(CPU,内存等)消耗出现倾斜的问题。 + 该功能通过将大单表按 Region 个数切分成多个数据范围,将这些数据范围分布到多个 TiCDC 节点上,使得多个 TiCDC 节点可以同时同步大单表。 配置样例如下所示: From cf745f8d4c8f1956d4155cf1d3068bb664788956 Mon Sep 17 00:00:00 2001 From: Neil Shen Date: Tue, 17 Jan 2023 21:05:28 +0800 Subject: [PATCH 03/13] increase head Signed-off-by: Neil Shen --- ticdc/ticdc-sink-to-kafka.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ticdc/ticdc-sink-to-kafka.md b/ticdc/ticdc-sink-to-kafka.md index 
ba64dacc8a65..7e6261a4e9fa 100644 --- a/ticdc/ticdc-sink-to-kafka.md +++ b/ticdc/ticdc-sink-to-kafka.md @@ -234,7 +234,7 @@ partition 分发器用 partition = "xxx" 来指定,支持 default、ts、index > {matcher = ['*.*'], dispatcher = "ts", partition = "table"}, > ``` -### 将大单表的同步负载水平扩张到多个 TiCDC 节点上 +## 将大单表的同步负载水平扩张到多个 TiCDC 节点上 该功能可以解决单个 TiCDC 节点不能及时同步大单表的问题和 TiCDC 节点之间资源(CPU,内存等)消耗出现倾斜的问题。 From 1f4dd489267699fd49a0347ddbf4335cb3082568 Mon Sep 17 00:00:00 2001 From: Neil Shen Date: Tue, 17 Jan 2023 21:06:23 +0800 Subject: [PATCH 04/13] typo Signed-off-by: Neil Shen --- ticdc/ticdc-sink-to-kafka.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ticdc/ticdc-sink-to-kafka.md b/ticdc/ticdc-sink-to-kafka.md index 7e6261a4e9fa..21be1c7d583b 100644 --- a/ticdc/ticdc-sink-to-kafka.md +++ b/ticdc/ticdc-sink-to-kafka.md @@ -234,7 +234,7 @@ partition 分发器用 partition = "xxx" 来指定,支持 default、ts、index > {matcher = ['*.*'], dispatcher = "ts", partition = "table"}, > ``` -## 将大单表的同步负载水平扩张到多个 TiCDC 节点上 +## 将大单表的同步负载水平扩展到多个 TiCDC 节点上 该功能可以解决单个 TiCDC 节点不能及时同步大单表的问题和 TiCDC 节点之间资源(CPU,内存等)消耗出现倾斜的问题。 From 701403b3b6635a96090730ba662367619eb65fe4 Mon Sep 17 00:00:00 2001 From: Neil Shen Date: Sat, 28 Jan 2023 16:30:55 +0800 Subject: [PATCH 05/13] Apply suggestions from code review Co-authored-by: Aolin --- ticdc/ticdc-sink-to-kafka.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/ticdc/ticdc-sink-to-kafka.md b/ticdc/ticdc-sink-to-kafka.md index 21be1c7d583b..1ffb264bbe90 100644 --- a/ticdc/ticdc-sink-to-kafka.md +++ b/ticdc/ticdc-sink-to-kafka.md @@ -236,17 +236,17 @@ partition 分发器用 partition = "xxx" 来指定,支持 default、ts、index ## 将大单表的同步负载水平扩展到多个 TiCDC 节点上 -该功能可以解决单个 TiCDC 节点不能及时同步大单表的问题和 TiCDC 节点之间资源(CPU,内存等)消耗出现倾斜的问题。 +该功能可以解决单个 TiCDC 节点不能及时同步大单表的问题和 TiCDC 节点之间资源(CPU、内存等)消耗出现倾斜的问题。 该功能通过将大单表按 Region 个数切分成多个数据范围,将这些数据范围分布到多个 TiCDC 节点上,使得多个 TiCDC 节点可以同时同步大单表。 配置样例如下所示: -```shell +```toml [scheduler] region-per-span = 50000 ``` > **注意:** > -> v6.6 的 TiCDC 仅支持在 Kafka 
同步任务上开启大单表的水平扩展功能。 +> TiCDC v6.6.0 仅支持在 Kafka 同步任务上开启大单表的水平扩展功能。 From 19bb1c18182be8c92f704a01ba9b063973f1a12c Mon Sep 17 00:00:00 2001 From: Neil Shen Date: Sat, 28 Jan 2023 16:33:08 +0800 Subject: [PATCH 06/13] Update ticdc/ticdc-sink-to-kafka.md --- ticdc/ticdc-sink-to-kafka.md | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/ticdc/ticdc-sink-to-kafka.md b/ticdc/ticdc-sink-to-kafka.md index 1ffb264bbe90..5b376ba9723e 100644 --- a/ticdc/ticdc-sink-to-kafka.md +++ b/ticdc/ticdc-sink-to-kafka.md @@ -236,7 +236,10 @@ partition 分发器用 partition = "xxx" 来指定,支持 default、ts、index ## 将大单表的同步负载水平扩展到多个 TiCDC 节点上 -该功能可以解决单个 TiCDC 节点不能及时同步大单表的问题和 TiCDC 节点之间资源(CPU、内存等)消耗出现倾斜的问题。 +该功能可以解决以下两个问题: + +- 单个 TiCDC 节点不能及时同步大单表的问题 +- TiCDC 节点之间资源(CPU、内存等)消耗不均匀的问题。 该功能通过将大单表按 Region 个数切分成多个数据范围,将这些数据范围分布到多个 TiCDC 节点上,使得多个 TiCDC 节点可以同时同步大单表。 From 1700fa1220bc8bd65fba8846876ee08b9acee964 Mon Sep 17 00:00:00 2001 From: Neil Shen Date: Sat, 28 Jan 2023 17:02:57 +0800 Subject: [PATCH 07/13] Update ticdc/ticdc-sink-to-kafka.md --- ticdc/ticdc-sink-to-kafka.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ticdc/ticdc-sink-to-kafka.md b/ticdc/ticdc-sink-to-kafka.md index 5b376ba9723e..9936722bfe2a 100644 --- a/ticdc/ticdc-sink-to-kafka.md +++ b/ticdc/ticdc-sink-to-kafka.md @@ -234,7 +234,7 @@ partition 分发器用 partition = "xxx" 来指定,支持 default、ts、index > {matcher = ['*.*'], dispatcher = "ts", partition = "table"}, > ``` -## 将大单表的同步负载水平扩展到多个 TiCDC 节点上 +## 横向扩展大单表的负载到多个 TiCDC 节点 该功能可以解决以下两个问题: From a63df6a07c11a6273d0c77e5cd2dc82a624b8f97 Mon Sep 17 00:00:00 2001 From: Neil Shen Date: Sat, 28 Jan 2023 17:26:48 +0800 Subject: [PATCH 08/13] Apply suggestions from code review Co-authored-by: Aolin --- ticdc/ticdc-sink-to-kafka.md | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/ticdc/ticdc-sink-to-kafka.md b/ticdc/ticdc-sink-to-kafka.md index 9936722bfe2a..9e608cb91647 100644 --- a/ticdc/ticdc-sink-to-kafka.md +++ 
b/ticdc/ticdc-sink-to-kafka.md @@ -240,6 +240,9 @@ partition 分发器用 partition = "xxx" 来指定,支持 default、ts、index - 单个 TiCDC 节点不能及时同步大单表的问题 - TiCDC 节点之间资源(CPU、内存等)消耗不均匀的问题。 +> **注意:** +> +> TiCDC v6.6.0 仅支持在 Kafka 同步任务上开启大单表的水平扩展功能。 该功能通过将大单表按 Region 个数切分成多个数据范围,将这些数据范围分布到多个 TiCDC 节点上,使得多个 TiCDC 节点可以同时同步大单表。 @@ -249,7 +252,3 @@ partition 分发器用 partition = "xxx" 来指定,支持 default、ts、index [scheduler] region-per-span = 50000 ``` - -> **注意:** -> -> TiCDC v6.6.0 仅支持在 Kafka 同步任务上开启大单表的水平扩展功能。 From 6c48ebfa608a798acca268bc34fb9c57809207ae Mon Sep 17 00:00:00 2001 From: Neil Shen Date: Sun, 29 Jan 2023 14:51:15 +0800 Subject: [PATCH 09/13] Apply suggestions from code review Co-authored-by: shichun-0415 <89768198+shichun-0415@users.noreply.github.com> Co-authored-by: Aolin --- ticdc/ticdc-sink-to-kafka.md | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/ticdc/ticdc-sink-to-kafka.md b/ticdc/ticdc-sink-to-kafka.md index 9e608cb91647..394f1d5d4ebc 100644 --- a/ticdc/ticdc-sink-to-kafka.md +++ b/ticdc/ticdc-sink-to-kafka.md @@ -238,8 +238,9 @@ partition 分发器用 partition = "xxx" 来指定,支持 default、ts、index 该功能可以解决以下两个问题: -- 单个 TiCDC 节点不能及时同步大单表的问题 -- TiCDC 节点之间资源(CPU、内存等)消耗不均匀的问题。 +- 单个 TiCDC 节点不能及时同步大单表。 +- TiCDC 节点之间资源(CPU、内存等)消耗不均匀。 + > **注意:** > > TiCDC v6.6.0 仅支持在 Kafka 同步任务上开启大单表的水平扩展功能。 From bc59044cc6aba5fddfdd9e5d8aa63c3fe97dd1ea Mon Sep 17 00:00:00 2001 From: Neil Shen Date: Sun, 29 Jan 2023 14:52:28 +0800 Subject: [PATCH 10/13] Apply suggestions from code review --- ticdc/ticdc-sink-to-kafka.md | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/ticdc/ticdc-sink-to-kafka.md b/ticdc/ticdc-sink-to-kafka.md index 394f1d5d4ebc..7fbbcd934aab 100644 --- a/ticdc/ticdc-sink-to-kafka.md +++ b/ticdc/ticdc-sink-to-kafka.md @@ -236,7 +236,7 @@ partition 分发器用 partition = "xxx" 来指定,支持 default、ts、index ## 横向扩展大单表的负载到多个 TiCDC 节点 -该功能可以解决以下两个问题: +该功能通过将大单表按 Region 个数切分成多个数据范围,将这些数据范围分布到多个 TiCDC 节点上,使得多个 TiCDC 节点可以同时同步大单表。该功能可以解决以下两个问题: - 单个 TiCDC 节点不能及时同步大单表。 - 
TiCDC 节点之间资源(CPU、内存等)消耗不均匀。 @@ -245,7 +245,6 @@ partition 分发器用 partition = "xxx" 来指定,支持 default、ts、index > > TiCDC v6.6.0 仅支持在 Kafka 同步任务上开启大单表的水平扩展功能。 -该功能通过将大单表按 Region 个数切分成多个数据范围,将这些数据范围分布到多个 TiCDC 节点上,使得多个 TiCDC 节点可以同时同步大单表。 配置样例如下所示: From 3f676c29e3a6f339c95414d67ab14dc9b3e60316 Mon Sep 17 00:00:00 2001 From: Neil Shen Date: Sun, 29 Jan 2023 17:13:57 +0800 Subject: [PATCH 11/13] Update ticdc/ticdc-sink-to-kafka.md --- ticdc/ticdc-sink-to-kafka.md | 2 -- 1 file changed, 2 deletions(-) diff --git a/ticdc/ticdc-sink-to-kafka.md b/ticdc/ticdc-sink-to-kafka.md index 7fbbcd934aab..236c30313040 100644 --- a/ticdc/ticdc-sink-to-kafka.md +++ b/ticdc/ticdc-sink-to-kafka.md @@ -244,8 +244,6 @@ partition 分发器用 partition = "xxx" 来指定,支持 default、ts、index > **注意:** > > TiCDC v6.6.0 仅支持在 Kafka 同步任务上开启大单表的水平扩展功能。 - - 配置样例如下所示: ```toml From 28fc122f48d2023a4319451da453efd924a7cddf Mon Sep 17 00:00:00 2001 From: Neil Shen Date: Thu, 2 Feb 2023 19:52:42 +0800 Subject: [PATCH 12/13] Update ticdc/ticdc-sink-to-kafka.md Co-authored-by: Aolin --- ticdc/ticdc-sink-to-kafka.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/ticdc/ticdc-sink-to-kafka.md b/ticdc/ticdc-sink-to-kafka.md index 236c30313040..dbf4f0814572 100644 --- a/ticdc/ticdc-sink-to-kafka.md +++ b/ticdc/ticdc-sink-to-kafka.md @@ -243,7 +243,8 @@ partition 分发器用 partition = "xxx" 来指定,支持 default、ts、index > **注意:** > -> TiCDC v6.6.0 仅支持在 Kafka 同步任务上开启大单表的水平扩展功能。 +> TiCDC v6.6.0 仅支持在 Kafka 同步任务上开启大单表的横向扩展功能。 + 配置样例如下所示: ```toml From 1d6b43d068a4d28c9f2b41d259c04434d30259bb Mon Sep 17 00:00:00 2001 From: Neil Shen Date: Thu, 2 Feb 2023 20:06:03 +0800 Subject: [PATCH 13/13] address comments Signed-off-by: Neil Shen --- ticdc/ticdc-changefeed-config.md | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/ticdc/ticdc-changefeed-config.md b/ticdc/ticdc-changefeed-config.md index cb983ba013b6..05e412b5f719 100644 --- a/ticdc/ticdc-changefeed-config.md +++ b/ticdc/ticdc-changefeed-config.md @@ 
-78,7 +78,7 @@ matcher = ["test.worker"] # matcher 是一个白名单,表示该过滤规则 ignore-event = ["insert"] # 过滤掉 insert 事件 ignore-sql = ["^drop", "add column"] # 过滤掉以 "drop" 开头或者包含 "add column" 的 DDL ignore-delete-value-expr = "name = 'john'" # 过滤掉包含 name = 'john' 条件的 delete DML -ignore-insert-value-expr = "id >= 100" # 过滤掉包含 id >= 100 条件的 insert DML +ignore-insert-value-expr = "id >= 100" # 过滤掉包含 id >= 100 条件的 insert DML ignore-update-old-value-expr = "age < 18" # 过滤掉旧值 age < 18 的 update DML ignore-update-new-value-expr = "gender = 'male'" # 过滤掉新值 gender = 'male' 的 update DML @@ -89,6 +89,13 @@ ignore-event = ["drop table"] # 忽略 drop table 事件 ignore-sql = ["delete"] # 忽略 delete DML ignore-insert-value-expr = "price > 1000 and origin = 'no where'" # 忽略包含 price > 1000 和 origin = 'no where' 条件的 insert DML +[scheduler] +# 将表按 Region 个数划分成多个同步范围,这些范围可由多个 TiCDC 节点同步。 +# 注意: +# 1. 该参数只在 Kafka changefeed 上生效,暂不支持 MySQL changefeed。 +# 2. 对于 Region 个数小于该参数值的表,TiCDC 不会将其划分成多个同步范围。 +# region-per-span = 50000 + [sink] # 对于 MQ 类的 Sink,可以通过 dispatchers 配置 event 分发器 # 支持 partition 及 topic(从 v6.1 开始支持)两种 event 分发器。二者的详细说明见下一节。