From af6aa2bc89f42479229732a8e8c0f5243630b1a4 Mon Sep 17 00:00:00 2001 From: Aolin Date: Tue, 18 Apr 2023 15:42:04 +0800 Subject: [PATCH 01/13] cdc: the e2e checksum integrity check functionality Signed-off-by: Aolin --- TOC.md | 1 + system-variables.md | 8 +++++ ticdc/ticdc-changefeed-config.md | 6 ++++ ticdc/ticdc-integrity-check.md | 56 ++++++++++++++++++++++++++++++++ 4 files changed, 71 insertions(+) create mode 100644 ticdc/ticdc-integrity-check.md diff --git a/TOC.md b/TOC.md index 25904234ae8c4..d6ebfed122bef 100644 --- a/TOC.md +++ b/TOC.md @@ -526,6 +526,7 @@ - [Manage Changefeeds](/ticdc/ticdc-manage-changefeed.md) - [Log Filter](/ticdc/ticdc-filter.md) - [Bidirectional Replication](/ticdc/ticdc-bidirectional-replication.md) + - [Data Integrity Validation](/ticdc/ticdc-integrity-check.md) - Monitor and Alert - [Monitoring Metrics](/ticdc/monitor-ticdc.md) - [Alert Rules](/ticdc/ticdc-alert-rules.md) diff --git a/system-variables.md b/system-variables.md index 4a5a1615ff260..58b658dccc73b 100644 --- a/system-variables.md +++ b/system-variables.md @@ -2065,6 +2065,14 @@ Query OK, 0 rows affected (0.09 sec) - Default value: `ON` - This variable is used to control whether to enable the support for window functions. Note that window functions may use reserved keywords. This might cause SQL statements that could be executed normally cannot be parsed after upgrading TiDB. In this case, you can set `tidb_enable_window_function` to `OFF`. +### `tidb_enable_row_level_checksum` New in v7.1.0 + +- Scope: GLOBAL +- Persists to cluster: Yes +- Type: Boolean +- Default value: `OFF` +- This variable is used to control whether to enable the [TiCDC data integrity validation](/ticdc/ticdc-integrity-check.md) feature. 
+ ### tidb_enforce_mpp New in v5.1 - Scope: SESSION diff --git a/ticdc/ticdc-changefeed-config.md b/ticdc/ticdc-changefeed-config.md index 6bbd79d8e4452..18aeec8fd433e 100644 --- a/ticdc/ticdc-changefeed-config.md +++ b/ticdc/ticdc-changefeed-config.md @@ -131,4 +131,10 @@ quote = '"' null = '\N' # Whether to include commit-ts in CSV rows. The default value is false. include-commit-ts = false + +[integrity] +# Whether to enable the checksum validation for single-row data. The default value is "none", which means to diable the feature. Value options are "none" and "correctness". +integrity-check-level = "none" +# Specifies the log level of the Changefeed when the checksum validation for single-row data fails. The default value is "warn". Value options are "warn" and "error". +corruption-handle-level = "warn" ``` diff --git a/ticdc/ticdc-integrity-check.md b/ticdc/ticdc-integrity-check.md new file mode 100644 index 0000000000000..24383840657d8 --- /dev/null +++ b/ticdc/ticdc-integrity-check.md @@ -0,0 +1,56 @@ +--- +title: TiCDC Data Integrity Validation +summary: Introduce the implementation principle and usage of the TiCDC data integrity validation feature. +--- + +# TiCDC Data Integrity Validation + +Starting from v7.1.0, TiCDC introduces the data integrity validation feature, which uses a checksum algorithm to validate the integrity of single-row data. This feature helps verify whether any error occurs in the process of writing data from TiDB, synchronizing it through TiCDC, and then writing it to a Kafka cluster. The data integrity validation feature only supports changefeeds that use Kafka as the downstream and currently supports the Avro protocol. + +## Implementation principles + +After you enable the checksum integrity validation feature for single-row data, TiDB uses the CRC32 algorithm to calculate the checksum of a row and writes it to TiKV along with the data. TiCDC reads the data from TiKV and recalculates the checksum using the same algorithm. 
If the two checksums are equal, it indicates that the data is accurate during the transmission from TiDB to TiCDC. + +TiCDC then encodes the data into a specific format and sends it to Kafka. After the Kafka Consumer reads data, it calculates a new checksum using the same algorithm as TiDB. If the new checksum is equal to the checksum in the data, it indicates that the data is accurate during the transmission from TiCDC to the Kafka Consumer. + +## Enable the feature + +TiCDC disables data integrity validation by default. To enable it, perform the following steps: + +1. Enable the checksum integrity validation feature for single-row data in the upstream TiDB cluster by setting the [`tidb_enable_row_level_checksum`](/system-variables.md#tidb_enable_row_level_checksum-new-in-v710) system variable: + + ```sql + SET GLOBAL tidb_enable_row_level_checksum = ON; + ``` + + This configuration only takes effect for newly created sessions, so you need to reconnect to TiDB. + +2. In the configuration file specified by the `--config` parameter when creating a changefeed, add the following configurations: + + ```toml + [integrity] + integrity-check-level = "correctness" + corruption-handle-level = "warn" + ``` + +3. When using Avro as the data encoding format, you need to set [`enable-tidb-extension=true`](/ticdc/ticdc-sink-to-kafka.md#configure-sink-uri-for-kafka), [`avro-decimal-handling-mode=string`](/ticdc/ticdc-sink-to-kafka.md#configure-sink-uri-for-kafka), and [`avro-bigint-unsigned-handling-mode=string`](/ticdc/ticdc-sink-to-kafka.md#configure-sink-uri-for-kafka) in the [`sink-uri`](/ticdc/ticdc-sink-to-kafka.md#configure-sink-uri-for-kafka). 
The following is an example: + + ```shell + cdc cli changefeed create --server=http://127.0.0.1:8300 --changefeed-id="kafka-avro-enable-extension" --sink-uri="kafka://127.0.0.1:9092/topic-name?protocol=avro&enable-tidb-extension=true&avro-decimal-handling-mode=string&avro-bigint-unsigned-handling-mode=string" --schema-registry=http://127.0.0.1:8081 --config changefeed_config.toml + ``` + + With the preceding configuration, each message written to Kafka by the Changefeed will include the corresponding data's checksum. You can verify data consistency based on these checksum values. + +## Disable the feature + +TiCDC disables data integrity validation by default. To disable this feature after enabling it, perform the following steps: + +1. Follow the `Pause Task -> Modify Configuration -> Resume Task` process described in [Update task configuration](/ticdc/ticdc-manage-changefeed.md#update-task-configuration) and remove all `[integrity]` configurations in the configuration file specified by the `--config` parameter of the Changefeed. + +2. Execute the following SQL statement in the upstream TiDB to disable the checksum integrity validation feature ([`tidb_enable_row_level_checksum`](/system-variables.md#tidb_enable_row_level_checksum-new-in-v710)): + + ```sql + SET GLOBAL tidb_enable_row_level_checksum = OFF; + ``` + + The preceding configuration only takes effect for newly created sessions. After all clients writing to TiDB have reconnected, the messages written by Changefeed to Kafka will no longer include the checksum for the corresponding data. 
From f0380661c0fcf1a03a26ffe6fc747c6d5ed693ba Mon Sep 17 00:00:00 2001 From: Aolin Date: Tue, 18 Apr 2023 16:27:02 +0800 Subject: [PATCH 02/13] Apply suggestions from code review Co-authored-by: xixirangrang --- ticdc/ticdc-changefeed-config.md | 2 +- ticdc/ticdc-integrity-check.md | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/ticdc/ticdc-changefeed-config.md b/ticdc/ticdc-changefeed-config.md index 18aeec8fd433e..f3cda853097c3 100644 --- a/ticdc/ticdc-changefeed-config.md +++ b/ticdc/ticdc-changefeed-config.md @@ -133,7 +133,7 @@ null = '\N' include-commit-ts = false [integrity] -# Whether to enable the checksum validation for single-row data. The default value is "none", which means to diable the feature. Value options are "none" and "correctness". +# Whether to enable the checksum validation for single-row data. The default value is "none", which means to disable the feature. Value options are "none" and "correctness". integrity-check-level = "none" # Specifies the log level of the Changefeed when the checksum validation for single-row data fails. The default value is "warn". Value options are "warn" and "error". corruption-handle-level = "warn" diff --git a/ticdc/ticdc-integrity-check.md b/ticdc/ticdc-integrity-check.md index 24383840657d8..37723b7d73afd 100644 --- a/ticdc/ticdc-integrity-check.md +++ b/ticdc/ticdc-integrity-check.md @@ -5,13 +5,13 @@ summary: Introduce the implementation principle and usage of the TiCDC data inte # TiCDC Data Integrity Validation -Starting from v7.1.0, TiCDC introduces the data integrity validation feature, which uses a checksum algorithm to validate the integrity of single-row data. This feature helps verify whether any error occurs in the process of writing data from TiDB, synchronizing it through TiCDC, and then writing it to a Kafka cluster. The data integrity validation feature only supports changefeeds that use Kafka as the downstream and currently supports the Avro protocol. 
+Starting from v7.1.0, TiCDC introduces the data integrity validation feature, which uses a checksum algorithm to validate the integrity of single-row data. This feature helps verify whether any error occurs in the process of writing data from TiDB, replicating it through TiCDC, and then writing it to a Kafka cluster. The data integrity validation feature only supports changefeeds that use Kafka as the downstream and currently supports the Avro protocol. ## Implementation principles -After you enable the checksum integrity validation feature for single-row data, TiDB uses the CRC32 algorithm to calculate the checksum of a row and writes it to TiKV along with the data. TiCDC reads the data from TiKV and recalculates the checksum using the same algorithm. If the two checksums are equal, it indicates that the data is accurate during the transmission from TiDB to TiCDC. +After you enable the checksum integrity validation feature for single-row data, TiDB uses the CRC32 algorithm to calculate the checksum of a row and writes it to TiKV along with the data. TiCDC reads the data from TiKV and recalculates the checksum using the same algorithm. If the two checksums are equal, it indicates that the data is consistent during the transmission from TiDB to TiCDC. -TiCDC then encodes the data into a specific format and sends it to Kafka. After the Kafka Consumer reads data, it calculates a new checksum using the same algorithm as TiDB. If the new checksum is equal to the checksum in the data, it indicates that the data is accurate during the transmission from TiCDC to the Kafka Consumer. +TiCDC then encodes the data into a specific format and sends it to Kafka. After the Kafka Consumer reads data, it calculates a new checksum using the same algorithm as TiDB. If the new checksum is equal to the checksum in the data, it indicates that the data is consistent during the transmission from TiCDC to the Kafka Consumer. 
## Enable the feature @@ -25,7 +25,7 @@ TiCDC disables data integrity validation by default. To enable it, perform the f This configuration only takes effect for newly created sessions, so you need to reconnect to TiDB. -2. In the configuration file specified by the `--config` parameter when creating a changefeed, add the following configurations: +2. In the configuration file specified by the `--config` parameter when you create a changefeed, add the following configurations: ```toml [integrity] From 15b1eca81673c80deaecfaa64a4eefb128dc27e3 Mon Sep 17 00:00:00 2001 From: Aolin Date: Thu, 20 Apr 2023 10:47:54 +0800 Subject: [PATCH 03/13] make ci happy --- system-variables.md | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/system-variables.md b/system-variables.md index 58b658dccc73b..a020cc2cea0ef 100644 --- a/system-variables.md +++ b/system-variables.md @@ -2071,8 +2071,19 @@ Query OK, 0 rows affected (0.09 sec) - Persists to cluster: Yes - Type: Boolean - Default value: `OFF` + + + - This variable is used to control whether to enable the [TiCDC data integrity validation](/ticdc/ticdc-integrity-check.md) feature. + + + + +- This variable is used to control whether to enable the [TiCDC data integrity validation](https://docs.pingcap.com/tidb/stable/ticdc-integrity-check) feature. + + + ### tidb_enforce_mpp New in v5.1 - Scope: SESSION From b1c3e1737417bf5b5853632946d38bfffdc9350e Mon Sep 17 00:00:00 2001 From: Aolin Date: Thu, 20 Apr 2023 13:52:22 +0800 Subject: [PATCH 04/13] Apply suggestions from code review --- system-variables.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/system-variables.md b/system-variables.md index a020cc2cea0ef..1bdf2d25a39bc 100644 --- a/system-variables.md +++ b/system-variables.md @@ -2080,7 +2080,7 @@ Query OK, 0 rows affected (0.09 sec) -- This variable is used to control whether to enable the [TiCDC data integrity validation](https://docs.pingcap.com/tidb/stable/ticdc-integrity-check) feature. 
+- This variable is used to control whether to enable the [TiCDC data integrity validation](https://docs.pingcap.com/tidb/dev/ticdc-integrity-check) feature. From 606a0cb77738c7df793bdb50c386ec247eb24380 Mon Sep 17 00:00:00 2001 From: Aolin Date: Wed, 17 May 2023 22:17:12 +0800 Subject: [PATCH 05/13] add TIDB_ROW_CHECKSUM and implementation details Signed-off-by: Aolin --- functions-and-operators/tidb-functions.md | 45 +++++++++++++++++++++++ ticdc/ticdc-integrity-check.md | 44 +++++++++++++++++++++- 2 files changed, 87 insertions(+), 2 deletions(-) diff --git a/functions-and-operators/tidb-functions.md b/functions-and-operators/tidb-functions.md index 7462c91925b50..cbe4479db0885 100644 --- a/functions-and-operators/tidb-functions.md +++ b/functions-and-operators/tidb-functions.md @@ -18,6 +18,7 @@ The following functions are TiDB extensions, and are not present in MySQL: | [`TIDB_DECODE_SQL_DIGESTS(digests, stmtTruncateLength)`](#tidb_decode_sql_digests) | The `TIDB_DECODE_SQL_DIGESTS()` function is used to query the normalized SQL statements (a form without formats and arguments) corresponding to the set of SQL digests in the cluster. | | `VITESS_HASH(str)` | The `VITESS_HASH` function returns the hash of a string that is compatible with Vitess' `HASH` function. This is intended to help the data migration from Vitess. | | `TIDB_SHARD()` | The `TIDB_SHARD` function can be used to create a shard index to scatter the index hotspot. A shard index is an expression index with a `TIDB_SHARD` function as the prefix.| +| `TIDB_ROW_CHECKSUM()` | The `TIDB_ROW_CHECKSUM` function is used to query the checksum value of a row. This function can only be used in `SELECT` statements within the FastPlan process. That is, you can query through statements like `SELECT TIDB_ROW_CHECKSUM() FROM t WHERE id = ?` or `SELECT TIDB_ROW_CHECKSUM() FROM t WHERE id IN (?, ?, ...)`. See also: [Data Integrity Validation for Single-Row Data](/ticdc/ticdc-integrity-check.md). 
| ## Examples @@ -297,3 +298,47 @@ TIDBShardExpr ::= ```sql CREATE TABLE test(id INT PRIMARY KEY CLUSTERED, a INT, b INT, UNIQUE KEY uk((tidb_shard(a)), a)); ``` + +### TIDB_ROW_CHECKSUM + +The `TIDB_ROW_CHECKSUM` function is used to query the checksum value of a row. This function can only be used in `SELECT` statements within the FastPlan process. That is, you can query through statements like `SELECT TIDB_ROW_CHECKSUM() FROM t WHERE id = ?` or `SELECT TIDB_ROW_CHECKSUM() FROM t WHERE id IN (?, ?, ...)`. + +The synopsis is as follows: + +```ebnf+diagram +TableStmt ::= + "TIDB_ROW_CHECKSUM()" +``` + +The following example shows how to use the `TIDB_ROW_CHECKSUM` function to query the Checksum value of the row data: + +To enable the checksum feature of single-row data in TiDB (controlled by the system variable [`tidb_enable_row_level_checksum`](/system-variables.md#tidb_enable_row_level_checksum-new-in-v710)), run the following statement: + +```sql +SET GLOBAL tidb_enable_row_level_checksum = ON; +``` + +Create table `t` and insert data: + +```sql +USE test; +CREATE TABLE t (id INT PRIMARY KEY, k INT, c int); +INSERT INTO TABLE t values (1, 10, a); +``` + +The following statement shows how to query the checksum value of the row where `id = 1` in table `t`: + +```sql +SELECT *, TIDB_ROW_CHECKSUM() FROM t WHERE id = 1; +``` + +The output is as follows: + +```sql ++----+------+------+---------------------+ +| id | k | c | TIDB_ROW_CHECKSUM() | ++----+------+------+---------------------+ +| 1 | 10 | a | 3813955661 | ++----+------+------+---------------------+ +1 row in set (0.000 sec) +``` diff --git a/ticdc/ticdc-integrity-check.md b/ticdc/ticdc-integrity-check.md index 37723b7d73afd..5799f6958a72f 100644 --- a/ticdc/ticdc-integrity-check.md +++ b/ticdc/ticdc-integrity-check.md @@ -1,9 +1,9 @@ --- -title: TiCDC Data Integrity Validation +title: TiCDC Data Integrity Validation for Single-Row Data summary: Introduce the implementation principle and usage of the TiCDC 
data integrity validation feature. --- -# TiCDC Data Integrity Validation +# TiCDC Data Integrity Validation for Single-Row Data Starting from v7.1.0, TiCDC introduces the data integrity validation feature, which uses a checksum algorithm to validate the integrity of single-row data. This feature helps verify whether any error occurs in the process of writing data from TiDB, replicating it through TiCDC, and then writing it to a Kafka cluster. The data integrity validation feature only supports changefeeds that use Kafka as the downstream and currently supports the Avro protocol. @@ -13,6 +13,8 @@ After you enable the checksum integrity validation feature for single-row data, TiCDC then encodes the data into a specific format and sends it to Kafka. After the Kafka Consumer reads data, it calculates a new checksum using the same algorithm as TiDB. If the new checksum is equal to the checksum in the data, it indicates that the data is consistent during the transmission from TiCDC to the Kafka Consumer. +For more information about the algorithm of the checksum, see [Algorithm for checksum calculation](#algorithm-for-checksum-calculation). + ## Enable the feature TiCDC disables data integrity validation by default. To enable it, perform the following steps: @@ -47,6 +49,12 @@ TiCDC disables data integrity validation by default. To disable this feature aft 1. Follow the `Pause Task -> Modify Configuration -> Resume Task` process described in [Update task configuration](/ticdc/ticdc-manage-changefeed.md#update-task-configuration) and remove all `[integrity]` configurations in the configuration file specified by the `--config` parameter of the Changefeed. + ```toml + [integrity] + integrity-check-level = "none" + corruption-handle-level = "warn" + ``` + 2. 
Execute the following SQL statement in the upstream TiDB to disable the checksum integrity validation feature ([`tidb_enable_row_level_checksum`](/system-variables.md#tidb_enable_row_level_checksum-new-in-v710)): ```sql @@ -54,3 +62,35 @@ TiCDC disables data integrity validation by default. To disable this feature aft ``` The preceding configuration only takes effect for newly created sessions. After all clients writing to TiDB have reconnected, the messages written by Changefeed to Kafka will no longer include the checksum for the corresponding data. + +## Algorithm for checksum calculation + +The pseudocode for the checksum calculation algorithm is as follows: + +``` +fn checksum(columns) { + let result = 0 + for column in sort_by_schema_order(columns) { + result = crc32.update(result, encode(column)) + } + return result +} +``` + +* `columns` should be sorted by column ID. In the Avro schema, fields are already sorted by column ID, so you can directly use the order in `columns`. + +* The `encode(column)` function encodes the column value into bytes. Encoding rules vary based on the data type of the column. The specific rules are as follows: + + * TINYINT, SMALLINT, INT, BIGINT, MEDIUMINT, and YEAR types are converted to UINT64 and encoded in little-endian. For example, the number `0x0123456789abcdef` is encoded as `hex'0x0123456789abcdef'`. + * FLOAT and DOUBLE types are converted to DOUBLE and then encoded as UINT64 in IEEE754 format. + * BIT, ENUM, and SET types are converted to UINT64. + + * BIT type is converted to UINT64 in binary format. + * ENUM and SET types are converted to their corresponding INT values in UINT64. For example, if the data value of a `SET('a','b','c')` type column is `'a,c'`, the value is encoded as `0b101`. + + * TIMESTAMP, DATE, DURATION, DATETIME, JSON, and DECIMAL types are converted to STRING and then encoded as UTF8 bytes. + * VARBIANRY, BINARY, and BLOB types (including TINY, MEDIUM, and LONG) are directly encoded as bytes. 
+ * VARCHAR, CHAR, and TEXT types (including TINY, MEDIUM, and LONG) are encoded as UTF8 bytes. + * NULL and GEOMETRY types are excluded from the checksum calculation and this function returns empty bytes. + +The consumer code written in Golang implements steps such as decoding data read from Kafka, sorting by schema fields, and calculating the checksum value. For more information, see [`avro/decoder.go`](https://github.com/pingcap/tiflow/blob/master/pkg/sink/codec/avro/decoder.go). From 3ddcb04853a6e97cef545dce4dd02f672757cc3a Mon Sep 17 00:00:00 2001 From: Aolin Date: Thu, 18 May 2023 10:22:12 +0800 Subject: [PATCH 06/13] Apply suggestions from code review Co-authored-by: xixirangrang --- TOC.md | 2 +- system-variables.md | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/TOC.md b/TOC.md index d6ebfed122bef..c65ef6d84453a 100644 --- a/TOC.md +++ b/TOC.md @@ -526,7 +526,7 @@ - [Manage Changefeeds](/ticdc/ticdc-manage-changefeed.md) - [Log Filter](/ticdc/ticdc-filter.md) - [Bidirectional Replication](/ticdc/ticdc-bidirectional-replication.md) - - [Data Integrity Validation](/ticdc/ticdc-integrity-check.md) + - [Data Integrity Validation for Single-Row Data](/ticdc/ticdc-integrity-check.md) - Monitor and Alert - [Monitoring Metrics](/ticdc/monitor-ticdc.md) - [Alert Rules](/ticdc/ticdc-alert-rules.md) diff --git a/system-variables.md b/system-variables.md index 1bdf2d25a39bc..0d09c2c24841f 100644 --- a/system-variables.md +++ b/system-variables.md @@ -2074,13 +2074,13 @@ Query OK, 0 rows affected (0.09 sec) -- This variable is used to control whether to enable the [TiCDC data integrity validation](/ticdc/ticdc-integrity-check.md) feature. +- This variable is used to control whether to enable the [TiCDC data integrity validation for single-row data](/ticdc/ticdc-integrity-check.md) feature. -- This variable is used to control whether to enable the [TiCDC data integrity validation](https://docs.pingcap.com/tidb/dev/ticdc-integrity-check) feature. 
+- This variable is used to control whether to enable the [TiCDC data integrity validation for single-row data](https://docs.pingcap.com/tidb/dev/ticdc-integrity-check) feature. From 2673b5cce07738bc6d0868fccd2a1404479223df Mon Sep 17 00:00:00 2001 From: Aolin Date: Thu, 18 May 2023 11:51:32 +0800 Subject: [PATCH 07/13] Update tidb-functions.md --- functions-and-operators/tidb-functions.md | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/functions-and-operators/tidb-functions.md b/functions-and-operators/tidb-functions.md index cbe4479db0885..cc0430521faec 100644 --- a/functions-and-operators/tidb-functions.md +++ b/functions-and-operators/tidb-functions.md @@ -7,6 +7,8 @@ summary: Learn about the usage of TiDB specific functions. The following functions are TiDB extensions, and are not present in MySQL: + + | Function name | Function description | | :-------------- | :------------------------------------- | | `TIDB_BOUNDED_STALENESS()` | The `TIDB_BOUNDED_STALENESS` function instructs TiDB to read the data as new as possible within the time range. See also: [Read Historical Data Using the `AS OF TIMESTAMP` Clause](/as-of-timestamp.md) | @@ -20,6 +22,25 @@ The following functions are TiDB extensions, and are not present in MySQL: | `TIDB_SHARD()` | The `TIDB_SHARD` function can be used to create a shard index to scatter the index hotspot. A shard index is an expression index with a `TIDB_SHARD` function as the prefix.| | `TIDB_ROW_CHECKSUM()` | The `TIDB_ROW_CHECKSUM` function is used to query the checksum value of a row. This function can only be used in `SELECT` statements within the FastPlan process. That is, you can query through statements like `SELECT TIDB_ROW_CHECKSUM() FROM t WHERE id = ?` or `SELECT TIDB_ROW_CHECKSUM() FROM t WHERE id IN (?, ?, ...)`. See also: [Data Integrity Validation for Single-Row Data](/ticdc/ticdc-integrity-check.md). 
| + + + + +| Function name | Function description | +| :-------------- | :------------------------------------- | +| `TIDB_BOUNDED_STALENESS()` | The `TIDB_BOUNDED_STALENESS` function instructs TiDB to read the data as new as possible within the time range. See also: [Read Historical Data Using the `AS OF TIMESTAMP` Clause](/as-of-timestamp.md) | +| [`TIDB_DECODE_KEY(str)`](#tidb_decode_key) | The `TIDB_DECODE_KEY` function can be used to decode a TiDB-encoded key entry into a JSON structure containing `_tidb_rowid` and `table_id`. These encoded keys can be found in some system tables and in logging outputs. | +| [`TIDB_DECODE_PLAN(str)`](#tidb_decode_plan) | The `TIDB_DECODE_PLAN` function can be used to decode a TiDB execution plan. | +| `TIDB_IS_DDL_OWNER()` | The `TIDB_IS_DDL_OWNER` function can be used to check whether or not the TiDB instance you are connected to is the one that is the DDL Owner. The DDL Owner is the TiDB instance that is tasked with executing DDL statements on behalf of all other nodes in the cluster. | +| [`TIDB_PARSE_TSO(num)`](#tidb_parse_tso) | The `TIDB_PARSE_TSO` function can be used to extract the physical timestamp from a TiDB TSO timestamp. See also: [`tidb_current_ts`](/system-variables.md#tidb_current_ts). | +| [`TIDB_VERSION()`](#tidb_version) | The `TIDB_VERSION` function returns the TiDB version with additional build information. | +| [`TIDB_DECODE_SQL_DIGESTS(digests, stmtTruncateLength)`](#tidb_decode_sql_digests) | The `TIDB_DECODE_SQL_DIGESTS()` function is used to query the normalized SQL statements (a form without formats and arguments) corresponding to the set of SQL digests in the cluster. | +| `VITESS_HASH(str)` | The `VITESS_HASH` function returns the hash of a string that is compatible with Vitess' `HASH` function. This is intended to help the data migration from Vitess. | +| `TIDB_SHARD()` | The `TIDB_SHARD` function can be used to create a shard index to scatter the index hotspot. 
A shard index is an expression index with a `TIDB_SHARD` function as the prefix.| +| `TIDB_ROW_CHECKSUM()` | The `TIDB_ROW_CHECKSUM` function is used to query the checksum value of a row. This function can only be used in `SELECT` statements within the FastPlan process. That is, you can query through statements like `SELECT TIDB_ROW_CHECKSUM() FROM t WHERE id = ?` or `SELECT TIDB_ROW_CHECKSUM() FROM t WHERE id IN (?, ?, ...)`. See also: [Data Integrity Validation for Single-Row Data](https://docs.pingcap.com/tidb/stable/ticdc-integrity-check). | + + + ## Examples This section provides examples for some of the functions above. From 4fab76a09b5fd0c7fbb9a9067fd91136dbf52320 Mon Sep 17 00:00:00 2001 From: Aolin Date: Thu, 18 May 2023 11:51:57 +0800 Subject: [PATCH 08/13] Apply suggestions from code review --- system-variables.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/system-variables.md b/system-variables.md index 0d09c2c24841f..5538ca6ecebf5 100644 --- a/system-variables.md +++ b/system-variables.md @@ -2080,7 +2080,7 @@ Query OK, 0 rows affected (0.09 sec) -- This variable is used to control whether to enable the [TiCDC data integrity validation for single-row data](https://docs.pingcap.com/tidb/dev/ticdc-integrity-check) feature. +- This variable is used to control whether to enable the [TiCDC data integrity validation for single-row data](https://docs.pingcap.com/tidb/stable/ticdc-integrity-check) feature. 
From f89bd1bf104d38fc1d3500af34ee5a7e11e58c93 Mon Sep 17 00:00:00 2001 From: Aolin Date: Thu, 18 May 2023 11:52:46 +0800 Subject: [PATCH 09/13] Apply suggestions from code review --- functions-and-operators/tidb-functions.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/functions-and-operators/tidb-functions.md b/functions-and-operators/tidb-functions.md index cc0430521faec..874dd946e5db8 100644 --- a/functions-and-operators/tidb-functions.md +++ b/functions-and-operators/tidb-functions.md @@ -20,7 +20,7 @@ The following functions are TiDB extensions, and are not present in MySQL: | [`TIDB_DECODE_SQL_DIGESTS(digests, stmtTruncateLength)`](#tidb_decode_sql_digests) | The `TIDB_DECODE_SQL_DIGESTS()` function is used to query the normalized SQL statements (a form without formats and arguments) corresponding to the set of SQL digests in the cluster. | | `VITESS_HASH(str)` | The `VITESS_HASH` function returns the hash of a string that is compatible with Vitess' `HASH` function. This is intended to help the data migration from Vitess. | | `TIDB_SHARD()` | The `TIDB_SHARD` function can be used to create a shard index to scatter the index hotspot. A shard index is an expression index with a `TIDB_SHARD` function as the prefix.| -| `TIDB_ROW_CHECKSUM()` | The `TIDB_ROW_CHECKSUM` function is used to query the checksum value of a row. This function can only be used in `SELECT` statements within the FastPlan process. That is, you can query through statements like `SELECT TIDB_ROW_CHECKSUM() FROM t WHERE id = ?` or `SELECT TIDB_ROW_CHECKSUM() FROM t WHERE id IN (?, ?, ...)`. See also: [Data Integrity Validation for Single-Row Data](/ticdc/ticdc-integrity-check.md). | +| `TIDB_ROW_CHECKSUM()` | The `TIDB_ROW_CHECKSUM` function is used to query the checksum value of a row. This function can only be used in `SELECT` statements within the FastPlan process. 
That is, you can query through statements like `SELECT TIDB_ROW_CHECKSUM() FROM t WHERE id = ?` or `SELECT TIDB_ROW_CHECKSUM() FROM t WHERE id IN (?, ?, ...)`. See also: [Data integrity validation for single-row data](/ticdc/ticdc-integrity-check.md). | @@ -37,7 +37,7 @@ The following functions are TiDB extensions, and are not present in MySQL: | [`TIDB_DECODE_SQL_DIGESTS(digests, stmtTruncateLength)`](#tidb_decode_sql_digests) | The `TIDB_DECODE_SQL_DIGESTS()` function is used to query the normalized SQL statements (a form without formats and arguments) corresponding to the set of SQL digests in the cluster. | | `VITESS_HASH(str)` | The `VITESS_HASH` function returns the hash of a string that is compatible with Vitess' `HASH` function. This is intended to help the data migration from Vitess. | | `TIDB_SHARD()` | The `TIDB_SHARD` function can be used to create a shard index to scatter the index hotspot. A shard index is an expression index with a `TIDB_SHARD` function as the prefix.| -| `TIDB_ROW_CHECKSUM()` | The `TIDB_ROW_CHECKSUM` function is used to query the checksum value of a row. This function can only be used in `SELECT` statements within the FastPlan process. That is, you can query through statements like `SELECT TIDB_ROW_CHECKSUM() FROM t WHERE id = ?` or `SELECT TIDB_ROW_CHECKSUM() FROM t WHERE id IN (?, ?, ...)`. See also: [Data Integrity Validation for Single-Row Data](https://docs.pingcap.com/tidb/stable/ticdc-integrity-check). | +| `TIDB_ROW_CHECKSUM()` | The `TIDB_ROW_CHECKSUM` function is used to query the checksum value of a row. This function can only be used in `SELECT` statements within the FastPlan process. That is, you can query through statements like `SELECT TIDB_ROW_CHECKSUM() FROM t WHERE id = ?` or `SELECT TIDB_ROW_CHECKSUM() FROM t WHERE id IN (?, ?, ...)`. See also: [Data integrity validation for single-row data](https://docs.pingcap.com/tidb/stable/ticdc-integrity-check). 
| From b2e8d562f56777624b8c359c6e25f1a3f3893df9 Mon Sep 17 00:00:00 2001 From: Aolin Date: Mon, 22 May 2023 17:25:22 +0800 Subject: [PATCH 10/13] Apply suggestions from code review --- ticdc/ticdc-integrity-check.md | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/ticdc/ticdc-integrity-check.md b/ticdc/ticdc-integrity-check.md index 5799f6958a72f..fcc25fe08aa6e 100644 --- a/ticdc/ticdc-integrity-check.md +++ b/ticdc/ticdc-integrity-check.md @@ -27,7 +27,7 @@ TiCDC disables data integrity validation by default. To enable it, perform the f This configuration only takes effect for newly created sessions, so you need to reconnect to TiDB. -2. In the configuration file specified by the `--config` parameter when you create a changefeed, add the following configurations: +2. In the [configuration file](/ticdc/ticdc-changefeed-config.md##changefeed-configuration-parameters) specified by the `--config` parameter when you create a changefeed, add the following configurations: ```toml [integrity] @@ -35,7 +35,7 @@ TiCDC disables data integrity validation by default. To enable it, perform the f corruption-handle-level = "warn" ``` -3. When using Avro as the data encoding format, you need to set [`enable-tidb-extension=true`](/ticdc/ticdc-sink-to-kafka.md#configure-sink-uri-for-kafka), [`avro-decimal-handling-mode=string`](/ticdc/ticdc-sink-to-kafka.md#configure-sink-uri-for-kafka), and [`avro-bigint-unsigned-handling-mode=string`](/ticdc/ticdc-sink-to-kafka.md#configure-sink-uri-for-kafka) in the [`sink-uri`](/ticdc/ticdc-sink-to-kafka.md#configure-sink-uri-for-kafka). The following is an example: +3. When using Avro as the data encoding format, you need to set [`enable-tidb-extension=true`](/ticdc/ticdc-sink-to-kafka.md#configure-sink-uri-for-kafka) in the [`sink-uri`](/ticdc/ticdc-sink-to-kafka.md#configure-sink-uri-for-kafka). 
To prevent numerical precision loss during network transmission, which can cause Checksum validation failures, you also need to set [`avro-decimal-handling-mode=string`](/ticdc/ticdc-sink-to-kafka.md#configure-sink-uri-for-kafka) and [`avro-bigint-unsigned-handling-mode=string`](/ticdc/ticdc-sink-to-kafka.md#configure-sink-uri-for-kafka). The following is an example: ```shell cdc cli changefeed create --server=http://127.0.0.1:8300 --changefeed-id="kafka-avro-enable-extension" --sink-uri="kafka://127.0.0.1:9092/topic-name?protocol=avro&enable-tidb-extension=true&avro-decimal-handling-mode=string&avro-bigint-unsigned-handling-mode=string" --schema-registry=http://127.0.0.1:8081 --config changefeed_config.toml @@ -43,6 +43,10 @@ TiCDC disables data integrity validation by default. To enable it, perform the f With the preceding configuration, each message written to Kafka by the Changefeed will include the corresponding data's checksum. You can verify data consistency based on these checksum values. + > **Note:** + > + > For existing Changefeeds, if `avro-decimal-handling-mode` and `avro-bigint-unsigned-handling-mode` are not set, enabling the Checksum validation feature might cause Schema compatibility issues. To resolve this issue, you can modify the compatibility type of the Schema Registry to `NONE`. For more details, see [Schema Registry](https://docs.confluent.io/platform/current/schema-registry/fundamentals/avro.html#no-compatibility-checking). + ## Disable the feature TiCDC disables data integrity validation by default. To disable this feature after enabling it, perform the following steps: @@ -93,4 +97,8 @@ fn checksum(columns) { * VARCHAR, CHAR, and TEXT types (including TINY, MEDIUM, and LONG) are encoded as UTF8 bytes. * NULL and GEOMETRY types are excluded from the checksum calculation and this function returns empty bytes. 
+> **Note:** +> +> After enabling the Checksum validation feature, DECIMAL and UNSIGNED BIGINT types data will be converted to string types. Therefore, in the downstream consumer code, you need to convert them back to their corresponding numerical types before calculating Checksum values. + The consumer code written in Golang implements steps such as decoding data read from Kafka, sorting by schema fields, and calculating the checksum value. For more information, see [`avro/decoder.go`](https://github.com/pingcap/tiflow/blob/master/pkg/sink/codec/avro/decoder.go). From ce16d913f5cdc45f883fd879683b1bc7c5ce2226 Mon Sep 17 00:00:00 2001 From: Aolin Date: Tue, 23 May 2023 12:22:09 +0800 Subject: [PATCH 11/13] fix ci --- ticdc/ticdc-integrity-check.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ticdc/ticdc-integrity-check.md b/ticdc/ticdc-integrity-check.md index fcc25fe08aa6e..c1868d0506f0e 100644 --- a/ticdc/ticdc-integrity-check.md +++ b/ticdc/ticdc-integrity-check.md @@ -27,7 +27,7 @@ TiCDC disables data integrity validation by default. To enable it, perform the f This configuration only takes effect for newly created sessions, so you need to reconnect to TiDB. -2. In the [configuration file](/ticdc/ticdc-changefeed-config.md##changefeed-configuration-parameters) specified by the `--config` parameter when you create a changefeed, add the following configurations: +2. 
In the [configuration file](/ticdc/ticdc-changefeed-config.md#changefeed-configuration-parameters) specified by the `--config` parameter when you create a changefeed, add the following configurations: ```toml [integrity] From 099685b70816e6d0831a67389e08d29ed568de96 Mon Sep 17 00:00:00 2001 From: Aolin Date: Tue, 23 May 2023 13:31:55 +0800 Subject: [PATCH 12/13] refine wording Signed-off-by: Aolin --- functions-and-operators/tidb-functions.md | 2 +- ticdc/ticdc-integrity-check.md | 12 ++++++------ 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/functions-and-operators/tidb-functions.md b/functions-and-operators/tidb-functions.md index 874dd946e5db8..e35c4a6115287 100644 --- a/functions-and-operators/tidb-functions.md +++ b/functions-and-operators/tidb-functions.md @@ -331,7 +331,7 @@ TableStmt ::= "TIDB_ROW_CHECKSUM()" ``` -The following example shows how to use the `TIDB_ROW_CHECKSUM` function to query the Checksum value of the row data: +The following example shows how to use the `TIDB_ROW_CHECKSUM` function to query the checksum value of the row data: To enable the checksum feature of single-row data in TiDB (controlled by the system variable [`tidb_enable_row_level_checksum`](/system-variables.md#tidb_enable_row_level_checksum-new-in-v710)), run the following statement: diff --git a/ticdc/ticdc-integrity-check.md b/ticdc/ticdc-integrity-check.md index c1868d0506f0e..1bac68fe9b7a2 100644 --- a/ticdc/ticdc-integrity-check.md +++ b/ticdc/ticdc-integrity-check.md @@ -35,23 +35,23 @@ TiCDC disables data integrity validation by default. To enable it, perform the f corruption-handle-level = "warn" ``` -3. When using Avro as the data encoding format, you need to set [`enable-tidb-extension=true`](/ticdc/ticdc-sink-to-kafka.md#configure-sink-uri-for-kafka) in the [`sink-uri`](/ticdc/ticdc-sink-to-kafka.md#configure-sink-uri-for-kafka). 
To prevent numerical precision loss during network transmission, which can cause Checksum validation failures, you also need to set [`avro-decimal-handling-mode=string`](/ticdc/ticdc-sink-to-kafka.md#configure-sink-uri-for-kafka) and [`avro-bigint-unsigned-handling-mode=string`](/ticdc/ticdc-sink-to-kafka.md#configure-sink-uri-for-kafka). The following is an example: +3. When using Avro as the data encoding format, you need to set [`enable-tidb-extension=true`](/ticdc/ticdc-sink-to-kafka.md#configure-sink-uri-for-kafka) in the [`sink-uri`](/ticdc/ticdc-sink-to-kafka.md#configure-sink-uri-for-kafka). To prevent numerical precision loss during network transmission, which can cause checksum validation failures, you also need to set [`avro-decimal-handling-mode=string`](/ticdc/ticdc-sink-to-kafka.md#configure-sink-uri-for-kafka) and [`avro-bigint-unsigned-handling-mode=string`](/ticdc/ticdc-sink-to-kafka.md#configure-sink-uri-for-kafka). The following is an example: ```shell cdc cli changefeed create --server=http://127.0.0.1:8300 --changefeed-id="kafka-avro-enable-extension" --sink-uri="kafka://127.0.0.1:9092/topic-name?protocol=avro&enable-tidb-extension=true&avro-decimal-handling-mode=string&avro-bigint-unsigned-handling-mode=string" --schema-registry=http://127.0.0.1:8081 --config changefeed_config.toml ``` - With the preceding configuration, each message written to Kafka by the Changefeed will include the corresponding data's checksum. You can verify data consistency based on these checksum values. + With the preceding configuration, each message written to Kafka by the changefeed will include the corresponding data's checksum. You can verify data consistency based on these checksum values. > **Note:** > - > For existing Changefeeds, if `avro-decimal-handling-mode` and `avro-bigint-unsigned-handling-mode` are not set, enabling the Checksum validation feature might cause Schema compatibility issues. 
To resolve this issue, you can modify the compatibility type of the Schema Registry to `NONE`. For more details, see [Schema Registry](https://docs.confluent.io/platform/current/schema-registry/fundamentals/avro.html#no-compatibility-checking). + > For existing changefeeds, if `avro-decimal-handling-mode` and `avro-bigint-unsigned-handling-mode` are not set, enabling the checksum validation feature might cause schema compatibility issues. To resolve this issue, you can modify the compatibility type of the Schema Registry to `NONE`. For more details, see [Schema Registry](https://docs.confluent.io/platform/current/schema-registry/fundamentals/avro.html#no-compatibility-checking). ## Disable the feature TiCDC disables data integrity validation by default. To disable this feature after enabling it, perform the following steps: -1. Follow the `Pause Task -> Modify Configuration -> Resume Task` process described in [Update task configuration](/ticdc/ticdc-manage-changefeed.md#update-task-configuration) and remove all `[integrity]` configurations in the configuration file specified by the `--config` parameter of the Changefeed. +1. Follow the `Pause Task -> Modify Configuration -> Resume Task` process described in [Update task configuration](/ticdc/ticdc-manage-changefeed.md#update-task-configuration) and remove all `[integrity]` configurations in the configuration file specified by the `--config` parameter of the changefeed. ```toml [integrity] @@ -65,7 +65,7 @@ TiCDC disables data integrity validation by default. To disable this feature aft SET GLOBAL tidb_enable_row_level_checksum = OFF; ``` - The preceding configuration only takes effect for newly created sessions. After all clients writing to TiDB have reconnected, the messages written by Changefeed to Kafka will no longer include the checksum for the corresponding data. + The preceding configuration only takes effect for newly created sessions. 
After all clients writing to TiDB have reconnected, the messages written by the changefeed to Kafka will no longer include the checksum for the corresponding data. ## Algorithm for checksum calculation @@ -99,6 +99,6 @@ fn checksum(columns) { > **Note:** > -> After enabling the Checksum validation feature, DECIMAL and UNSIGNED BIGINT types data will be converted to string types. Therefore, in the downstream consumer code, you need to convert them back to their corresponding numerical types before calculating Checksum values. +> After enabling the checksum validation feature, data of the DECIMAL and UNSIGNED BIGINT types will be converted to string types. Therefore, in the downstream consumer code, you need to convert them back to their corresponding numerical types before calculating checksum values. The consumer code written in Golang implements steps such as decoding data read from Kafka, sorting by schema fields, and calculating the checksum value. For more information, see [`avro/decoder.go`](https://github.com/pingcap/tiflow/blob/master/pkg/sink/codec/avro/decoder.go). From 1c5f07e739e7b4145d36d96210a8e1a1a53c7112 Mon Sep 17 00:00:00 2001 From: Aolin Date: Tue, 23 May 2023 13:37:19 +0800 Subject: [PATCH 13/13] update code example Signed-off-by: Aolin --- ticdc/ticdc-integrity-check.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ticdc/ticdc-integrity-check.md b/ticdc/ticdc-integrity-check.md index 1bac68fe9b7a2..1142d93548b4b 100644 --- a/ticdc/ticdc-integrity-check.md +++ b/ticdc/ticdc-integrity-check.md @@ -38,7 +38,7 @@ TiCDC disables data integrity validation by default. To enable it, perform the f 3. When using Avro as the data encoding format, you need to set [`enable-tidb-extension=true`](/ticdc/ticdc-sink-to-kafka.md#configure-sink-uri-for-kafka) in the [`sink-uri`](/ticdc/ticdc-sink-to-kafka.md#configure-sink-uri-for-kafka). 
To prevent numerical precision loss during network transmission, which can cause checksum validation failures, you also need to set [`avro-decimal-handling-mode=string`](/ticdc/ticdc-sink-to-kafka.md#configure-sink-uri-for-kafka) and [`avro-bigint-unsigned-handling-mode=string`](/ticdc/ticdc-sink-to-kafka.md#configure-sink-uri-for-kafka). The following is an example: ```shell - cdc cli changefeed create --server=http://127.0.0.1:8300 --changefeed-id="kafka-avro-enable-extension" --sink-uri="kafka://127.0.0.1:9092/topic-name?protocol=avro&enable-tidb-extension=true&avro-decimal-handling-mode=string&avro-bigint-unsigned-handling-mode=string" --schema-registry=http://127.0.0.1:8081 --config changefeed_config.toml + cdc cli changefeed create --server=http://127.0.0.1:8300 --changefeed-id="kafka-avro-checksum" --sink-uri="kafka://127.0.0.1:9092/topic-name?protocol=avro&enable-tidb-extension=true&avro-decimal-handling-mode=string&avro-bigint-unsigned-handling-mode=string" --schema-registry=http://127.0.0.1:8081 --config changefeed_config.toml ``` With the preceding configuration, each message written to Kafka by the changefeed will include the corresponding data's checksum. You can verify data consistency based on these checksum values.