From cc0337e85739e731919927e4294abaf81caf4fe8 Mon Sep 17 00:00:00 2001 From: glorv Date: Tue, 2 Mar 2021 16:02:29 +0800 Subject: [PATCH 01/32] support restore data into tables that contains data --- pkg/lightning/backend/tidb.go | 100 +++++---- pkg/lightning/checkpoints/checkpoints.go | 56 ++++- .../checkpoints/checkpoints_file_test.go | 5 + .../checkpoints/checkpoints_sql_test.go | 15 +- .../checkpoints/file_checkpoints.pb.go | 209 ++++++++++++------ .../checkpoints/file_checkpoints.proto | 3 + pkg/lightning/mydump/region.go | 2 +- pkg/lightning/mydump/region_test.go | 2 +- pkg/lightning/restore/restore.go | 53 ++++- pkg/lightning/restore/restore_test.go | 6 +- tests/lightning_checkpoint/run.sh | 2 +- tests/lightning_checkpoint_chunks/run.sh | 2 +- 12 files changed, 330 insertions(+), 125 deletions(-) diff --git a/pkg/lightning/backend/tidb.go b/pkg/lightning/backend/tidb.go index d9f64bb13..b575a9c95 100644 --- a/pkg/lightning/backend/tidb.go +++ b/pkg/lightning/backend/tidb.go @@ -485,48 +485,23 @@ func (be *tidbBackend) FetchRemoteTableModels(ctx context.Context, schemaName st if rows.Err() != nil { return rows.Err() } - // for version < v4.0.0 we can use `show table next_row_id` to fetch auto id info, so about should be enough + // shard_row_id/auto random is only available after tidb v4.0.0 + // `show table next_row_id` is also not available before tidb v4.0.0 if tidbVersion.Major < 4 { return nil } + // init auto id column for each table for _, tbl := range tables { tblName := common.UniqueTable(schemaName, tbl.Name.O) - rows, e = tx.Query(fmt.Sprintf("SHOW TABLE %s NEXT_ROW_ID", tblName)) - if e != nil { - return e + autoIDInfos, err := FetchTableAutoIDInfos(tx, tblName) + if err != nil { + return errors.Trace(err) } - for rows.Next() { - var ( - dbName, tblName, columnName, idType string - nextID int64 - ) - columns, err := rows.Columns() - if err != nil { - return err - } - - //+--------------+------------+-------------+--------------------+----------------+ - //| DB_NAME | TABLE_NAME | COLUMN_NAME | NEXT_GLOBAL_ROW_ID | ID_TYPE | - //+--------------+------------+-------------+--------------------+----------------+ - //| testsysbench | t | _tidb_rowid | 1 | AUTO_INCREMENT | - //+--------------+------------+-------------+--------------------+----------------+ - - // if columns length is 4, it doesn't contains the last column `ID_TYPE`, and it will always be 'AUTO_INCREMENT' - // for v4.0.0~v4.0.2 show table t next_row_id only returns 4 columns. 
- if len(columns) == 4 { - err = rows.Scan(&dbName, &tblName, &columnName, &nextID) - idType = "AUTO_INCREMENT" - } else { - err = rows.Scan(&dbName, &tblName, &columnName, &nextID, &idType) - } - if err != nil { - return err - } - + for _, info := range autoIDInfos { for _, col := range tbl.Columns { - if col.Name.O == columnName { - switch idType { + if col.Name.O == info.Column { + switch info.Type { case "AUTO_INCREMENT": col.Flag |= mysql.AutoIncrementFlag case "AUTO_RANDOM": @@ -538,10 +513,6 @@ func (be *tidbBackend) FetchRemoteTableModels(ctx context.Context, schemaName st } } } - rows.Close() - if rows.Err() != nil { - return rows.Err() - } } return nil }) @@ -580,3 +551,56 @@ func (w *TiDBWriter) Close() error { func (w *TiDBWriter) AppendRows(ctx context.Context, tableName string, columnNames []string, arg1 uint64, rows Rows) error { return w.be.WriteRows(ctx, w.engineUUID, tableName, columnNames, arg1, rows) } + +type QueryExecutor interface { + Query(query string, args ...interface{}) (*sql.Rows, error) +} + +type TableAutoIDInfo struct { + Column string + NextID int64 + Type string +} + +func FetchTableAutoIDInfos(exec QueryExecutor, tableName string) ([]*TableAutoIDInfo, error) { + rows, e := exec.Query(fmt.Sprintf("SHOW TABLE %s NEXT_ROW_ID", tableName)) + if e != nil { + return nil, errors.Trace(e) + } + defer rows.Close() + var autoIDInfos []*TableAutoIDInfo + for rows.Next() { + var ( + dbName, tblName, columnName, idType string + nextID int64 + ) + columns, err := rows.Columns() + if err != nil { + return nil, errors.Trace(err) + } + + //+--------------+------------+-------------+--------------------+----------------+ + //| DB_NAME | TABLE_NAME | COLUMN_NAME | NEXT_GLOBAL_ROW_ID | ID_TYPE | + //+--------------+------------+-------------+--------------------+----------------+ + //| testsysbench | t | _tidb_rowid | 1 | AUTO_INCREMENT | + //+--------------+------------+-------------+--------------------+----------------+ + + // if columns length is 4, it doesn't contains the last column `ID_TYPE`, and it will always be 'AUTO_INCREMENT' + // for v4.0.0~v4.0.2 show table t next_row_id only returns 4 columns. + if len(columns) == 4 { + err = rows.Scan(&dbName, &tblName, &columnName, &nextID) + idType = "AUTO_INCREMENT" + } else { + err = rows.Scan(&dbName, &tblName, &columnName, &nextID, &idType) + } + if err != nil { + return nil, errors.Trace(err) + } + autoIDInfos = append(autoIDInfos, &TableAutoIDInfo{ + Column: columnName, + NextID: nextID, + Type: idType, + }) + } + return autoIDInfos, nil +} diff --git a/pkg/lightning/checkpoints/checkpoints.go b/pkg/lightning/checkpoints/checkpoints.go index 5d0a9935f..63a2b47e8 100644 --- a/pkg/lightning/checkpoints/checkpoints.go +++ b/pkg/lightning/checkpoints/checkpoints.go @@ -61,7 +61,7 @@ const ( // the table names to store each kind of checkpoint in the checkpoint database // remember to increase the version number in case of incompatible change. 
CheckpointTableNameTask = "task_v2" - CheckpointTableNameTable = "table_v6" + CheckpointTableNameTable = "table_v7" CheckpointTableNameEngine = "engine_v5" CheckpointTableNameChunk = "chunk_v5" ) @@ -92,6 +92,9 @@ const ( table_id bigint NOT NULL DEFAULT 0, create_time timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP, update_time timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP, + kv_bytes bigint unsigned NOT NULL DEFAULT 0, + kv_kvs bigint unsigned NOT NULL DEFAULT 0, + kv_checksum bigint unsigned NOT NULL DEFAULT 0, INDEX(task_id) );` CreateEngineTableTemplate = ` @@ -147,7 +150,7 @@ const ( FROM %s.%s WHERE table_name = ? ORDER BY engine_id, path, offset;` ReadTableRemainTemplate = ` - SELECT status, alloc_base, table_id FROM %s.%s WHERE table_name = ?;` + SELECT status, alloc_base, table_id, kv_bytes, kv_kvs, kv_checksum FROM %s.%s WHERE table_name = ?;` ReplaceEngineTemplate = ` REPLACE INTO %s.%s (table_name, engine_id, status) VALUES (?, ?, ?);` ReplaceChunkTemplate = ` @@ -169,7 +172,8 @@ const ( UPDATE %s.%s SET alloc_base = GREATEST(?, alloc_base) WHERE table_name = ?;` UpdateTableStatusTemplate = ` UPDATE %s.%s SET status = ? WHERE table_name = ?;` - UpdateEngineTemplate = ` + UpdateTableChecksumTemplate = `UPDATE %s.%s SET kv_bytes = ?, kv_kvs = ?, kv_checksum = ? WHERE table_name = ?;` + UpdateEngineTemplate = ` UPDATE %s.%s SET status = ? WHERE (table_name, engine_id) = (?, ?);` DeleteCheckpointRecordTemplate = "DELETE FROM %s.%s WHERE table_name = ?;" ) @@ -271,6 +275,8 @@ type TableCheckpoint struct { AllocBase int64 Engines map[int32]*EngineCheckpoint TableID int64 + // remote checksum before restore + Checksum verify.KVChecksum } func (cp *TableCheckpoint) DeepCopy() *TableCheckpoint { @@ -283,6 +289,7 @@ func (cp *TableCheckpoint) DeepCopy() *TableCheckpoint { AllocBase: cp.AllocBase, Engines: engines, TableID: cp.TableID, + Checksum: cp.Checksum, } } @@ -308,11 +315,13 @@ type engineCheckpointDiff struct { } type TableCheckpointDiff struct { - hasStatus bool - hasRebase bool - status CheckpointStatus - allocBase int64 - engines map[int32]engineCheckpointDiff + hasStatus bool + hasRebase bool + hasChecksum bool + status CheckpointStatus + allocBase int64 + engines map[int32]engineCheckpointDiff + checksum verify.KVChecksum } func NewTableCheckpointDiff() *TableCheckpointDiff { @@ -430,6 +439,15 @@ func (merger *ChunkCheckpointMerger) MergeInto(cpd *TableCheckpointDiff) { }) } +type TableChecksumMerger struct { + Checksum verify.KVChecksum +} + +func (m *TableChecksumMerger) MergeInto(cpd *TableCheckpointDiff) { + cpd.hasChecksum = true + cpd.checksum = m.Checksum +} + type RebaseCheckpointMerger struct { AllocBase int64 } @@ -768,9 +786,11 @@ func (cpdb *MySQLCheckpointsDB) Get(ctx context.Context, tableName string) (*Tab tableRow := tx.QueryRowContext(c, tableQuery, tableName) var status uint8 - if err := tableRow.Scan(&status, &cp.AllocBase, &cp.TableID); err != nil { + var kvs, bytes, checksum uint64 + if err := tableRow.Scan(&status, &cp.AllocBase, &cp.TableID, &bytes, &kvs, &checksum); err != nil { return errors.Trace(err) } + cp.Checksum = verify.MakeKVChecksum(bytes, kvs, checksum) cp.Status = CheckpointStatus(status) return nil }) @@ -834,6 +854,7 @@ func (cpdb *MySQLCheckpointsDB) Update(checkpointDiffs map[string]*TableCheckpoi chunkQuery := fmt.Sprintf(UpdateChunkTemplate, cpdb.schema, CheckpointTableNameChunk) rebaseQuery := fmt.Sprintf(UpdateTableRebaseTemplate, cpdb.schema, CheckpointTableNameTable) tableStatusQuery := 
fmt.Sprintf(UpdateTableStatusTemplate, cpdb.schema, CheckpointTableNameTable) + tableChecksumQuery := fmt.Sprintf(UpdateTableChecksumTemplate, cpdb.schema, CheckpointTableNameTable) engineStatusQuery := fmt.Sprintf(UpdateEngineTemplate, cpdb.schema, CheckpointTableNameEngine) s := common.SQLWithRetry{DB: cpdb.db, Logger: log.L()} @@ -853,12 +874,16 @@ func (cpdb *MySQLCheckpointsDB) Update(checkpointDiffs map[string]*TableCheckpoi return errors.Trace(e) } defer tableStatusStmt.Close() + tableChecksumStmt, e := tx.PrepareContext(c, tableChecksumQuery) + if e != nil { + return errors.Trace(e) + } + defer tableChecksumStmt.Close() engineStatusStmt, e := tx.PrepareContext(c, engineStatusQuery) if e != nil { return errors.Trace(e) } defer engineStatusStmt.Close() - for tableName, cpd := range checkpointDiffs { if cpd.hasStatus { if _, e := tableStatusStmt.ExecContext(c, cpd.status, tableName); e != nil { @@ -870,6 +895,11 @@ func (cpdb *MySQLCheckpointsDB) Update(checkpointDiffs map[string]*TableCheckpoi return errors.Trace(e) } } + if cpd.hasChecksum { + if _, e := tableChecksumStmt.ExecContext(c, cpd.checksum.SumSize(), cpd.checksum.SumKVS(), cpd.checksum.Sum(), tableName); e != nil { + return errors.Trace(e) + } + } for engineID, engineDiff := range cpd.engines { if engineDiff.hasStatus { if _, e := engineStatusStmt.ExecContext(c, engineDiff.status, tableName, engineID); e != nil { @@ -1033,6 +1063,7 @@ func (cpdb *FileCheckpointsDB) Get(_ context.Context, tableName string) (*TableC AllocBase: tableModel.AllocBase, Engines: make(map[int32]*EngineCheckpoint, len(tableModel.Engines)), TableID: tableModel.TableID, + Checksum: verify.MakeKVChecksum(tableModel.KvBytes, tableModel.KvKvs, tableModel.KvChecksum), } for engineID, engineModel := range tableModel.Engines { @@ -1132,6 +1163,11 @@ func (cpdb *FileCheckpointsDB) Update(checkpointDiffs map[string]*TableCheckpoin if cpd.hasRebase { tableModel.AllocBase = cpd.allocBase } + if cpd.hasChecksum { + tableModel.KvBytes = cpd.checksum.SumSize() + tableModel.KvKvs = cpd.checksum.SumKVS() + tableModel.KvChecksum = cpd.checksum.Sum() + } for engineID, engineDiff := range cpd.engines { engineModel := tableModel.Engines[engineID] if engineDiff.hasStatus { diff --git a/pkg/lightning/checkpoints/checkpoints_file_test.go b/pkg/lightning/checkpoints/checkpoints_file_test.go index e49a9d738..c857eef52 100644 --- a/pkg/lightning/checkpoints/checkpoints_file_test.go +++ b/pkg/lightning/checkpoints/checkpoints_file_test.go @@ -118,6 +118,10 @@ func (s *cpFileSuite) SetUpTest(c *C) { AllocBase: 132861, } rcm.MergeInto(cpd) + cksum := checkpoints.TableChecksumMerger{ + Checksum: verification.MakeKVChecksum(4492, 686, 486070148910), + } + cksum.MergeInto(cpd) ccm := checkpoints.ChunkCheckpointMerger{ EngineID: 0, Key: checkpoints.ChunkCheckpointKey{Path: "/tmp/path/1.sql", Offset: 0}, @@ -159,6 +163,7 @@ func (s *cpFileSuite) TestGet(c *C) { c.Assert(cp, DeepEquals, &checkpoints.TableCheckpoint{ Status: checkpoints.CheckpointStatusAllWritten, AllocBase: 132861, + Checksum: verification.MakeKVChecksum(4492, 686, 486070148910), Engines: map[int32]*checkpoints.EngineCheckpoint{ -1: { Status: checkpoints.CheckpointStatusLoaded, diff --git a/pkg/lightning/checkpoints/checkpoints_sql_test.go b/pkg/lightning/checkpoints/checkpoints_sql_test.go index fdf000182..6150c00a0 100644 --- a/pkg/lightning/checkpoints/checkpoints_sql_test.go +++ b/pkg/lightning/checkpoints/checkpoints_sql_test.go @@ -174,6 +174,10 @@ func (s *cpSQLSuite) TestNormalOperations(c *C) { AllocBase: 
132861, } rcm.MergeInto(cpd) + cksum := checkpoints.TableChecksumMerger{ + Checksum: verification.MakeKVChecksum(4492, 686, 486070148910), + } + cksum.MergeInto(cpd) ccm := checkpoints.ChunkCheckpointMerger{ EngineID: 0, Key: checkpoints.ChunkCheckpointKey{Path: "/tmp/path/1.sql", Offset: 0}, @@ -207,6 +211,12 @@ func (s *cpSQLSuite) TestNormalOperations(c *C) { ExpectExec(). WithArgs(60, "`db1`.`t2`"). WillReturnResult(sqlmock.NewResult(14, 1)) + s.mock. + ExpectPrepare("UPDATE `mock-schema`\\.table_v\\d+ SET kv_bytes = .+"). + ExpectExec(). + WithArgs(4492, 686, 486070148910, "`db1`.`t2`"). + WillReturnResult(sqlmock.NewResult(15, 1)) + s.mock.ExpectCommit() s.mock.MatchExpectationsInOrder(false) @@ -244,8 +254,8 @@ func (s *cpSQLSuite) TestNormalOperations(c *C) { ExpectQuery("SELECT .+ FROM `mock-schema`\\.table_v\\d+"). WithArgs("`db1`.`t2`"). WillReturnRows( - sqlmock.NewRows([]string{"status", "alloc_base", "table_id"}). - AddRow(60, 132861, int64(2)), + sqlmock.NewRows([]string{"status", "alloc_base", "table_id", "kv_bytes", "kv_kvs", "kv_checksum"}). + AddRow(60, 132861, int64(2), uint64(4492), uint64(686), uint64(486070148910)), ) s.mock.ExpectCommit() @@ -281,6 +291,7 @@ func (s *cpSQLSuite) TestNormalOperations(c *C) { }}, }, }, + Checksum: verification.MakeKVChecksum(4492, 686, 486070148910), }) c.Assert(s.mock.ExpectationsWereMet(), IsNil) } diff --git a/pkg/lightning/checkpoints/file_checkpoints.pb.go b/pkg/lightning/checkpoints/file_checkpoints.pb.go index a70ae492e..35646cde2 100644 --- a/pkg/lightning/checkpoints/file_checkpoints.pb.go +++ b/pkg/lightning/checkpoints/file_checkpoints.pb.go @@ -34,7 +34,7 @@ func (m *CheckpointsModel) Reset() { *m = CheckpointsModel{} } func (m *CheckpointsModel) String() string { return proto.CompactTextString(m) } func (*CheckpointsModel) ProtoMessage() {} func (*CheckpointsModel) Descriptor() ([]byte, []int) { - return fileDescriptor_5c42c6f4a8e50df4, []int{0} + return fileDescriptor_192d25c2b79ee97d, []int{0} } func (m *CheckpointsModel) XXX_Unmarshal(b []byte) error { return m.Unmarshal(b) @@ -79,7 +79,7 @@ func (m *TaskCheckpointModel) Reset() { *m = TaskCheckpointModel{} } func (m *TaskCheckpointModel) String() string { return proto.CompactTextString(m) } func (*TaskCheckpointModel) ProtoMessage() {} func (*TaskCheckpointModel) Descriptor() ([]byte, []int) { - return fileDescriptor_5c42c6f4a8e50df4, []int{1} + return fileDescriptor_192d25c2b79ee97d, []int{1} } func (m *TaskCheckpointModel) XXX_Unmarshal(b []byte) error { return m.Unmarshal(b) @@ -109,18 +109,21 @@ func (m *TaskCheckpointModel) XXX_DiscardUnknown() { var xxx_messageInfo_TaskCheckpointModel proto.InternalMessageInfo type TableCheckpointModel struct { - Hash []byte `protobuf:"bytes,1,opt,name=hash,proto3" json:"hash,omitempty"` - Status uint32 `protobuf:"varint,3,opt,name=status,proto3" json:"status,omitempty"` - AllocBase int64 `protobuf:"varint,4,opt,name=alloc_base,json=allocBase,proto3" json:"alloc_base,omitempty"` - Engines map[int32]*EngineCheckpointModel `protobuf:"bytes,8,rep,name=engines,proto3" json:"engines,omitempty" protobuf_key:"zigzag32,1,opt,name=key,proto3" protobuf_val:"bytes,2,opt,name=value,proto3"` - TableID int64 `protobuf:"varint,9,opt,name=tableID,proto3" json:"tableID,omitempty"` + Hash []byte `protobuf:"bytes,1,opt,name=hash,proto3" json:"hash,omitempty"` + Status uint32 `protobuf:"varint,3,opt,name=status,proto3" json:"status,omitempty"` + AllocBase int64 `protobuf:"varint,4,opt,name=alloc_base,json=allocBase,proto3" 
json:"alloc_base,omitempty"` + Engines map[int32]*EngineCheckpointModel `protobuf:"bytes,8,rep,name=engines,proto3" json:"engines,omitempty" protobuf_key:"zigzag32,1,opt,name=key,proto3" protobuf_val:"bytes,2,opt,name=value,proto3"` + TableID int64 `protobuf:"varint,9,opt,name=tableID,proto3" json:"tableID,omitempty"` + KvBytes uint64 `protobuf:"varint,10,opt,name=kv_bytes,json=kvBytes,proto3" json:"kv_bytes,omitempty"` + KvKvs uint64 `protobuf:"varint,11,opt,name=kv_kvs,json=kvKvs,proto3" json:"kv_kvs,omitempty"` + KvChecksum uint64 `protobuf:"fixed64,12,opt,name=kv_checksum,json=kvChecksum,proto3" json:"kv_checksum,omitempty"` } func (m *TableCheckpointModel) Reset() { *m = TableCheckpointModel{} } func (m *TableCheckpointModel) String() string { return proto.CompactTextString(m) } func (*TableCheckpointModel) ProtoMessage() {} func (*TableCheckpointModel) Descriptor() ([]byte, []int) { - return fileDescriptor_5c42c6f4a8e50df4, []int{2} + return fileDescriptor_192d25c2b79ee97d, []int{2} } func (m *TableCheckpointModel) XXX_Unmarshal(b []byte) error { return m.Unmarshal(b) @@ -159,7 +162,7 @@ func (m *EngineCheckpointModel) Reset() { *m = EngineCheckpointModel{} } func (m *EngineCheckpointModel) String() string { return proto.CompactTextString(m) } func (*EngineCheckpointModel) ProtoMessage() {} func (*EngineCheckpointModel) Descriptor() ([]byte, []int) { - return fileDescriptor_5c42c6f4a8e50df4, []int{3} + return fileDescriptor_192d25c2b79ee97d, []int{3} } func (m *EngineCheckpointModel) XXX_Unmarshal(b []byte) error { return m.Unmarshal(b) @@ -210,7 +213,7 @@ func (m *ChunkCheckpointModel) Reset() { *m = ChunkCheckpointModel{} } func (m *ChunkCheckpointModel) String() string { return proto.CompactTextString(m) } func (*ChunkCheckpointModel) ProtoMessage() {} func (*ChunkCheckpointModel) Descriptor() ([]byte, []int) { - return fileDescriptor_5c42c6f4a8e50df4, []int{4} + return fileDescriptor_192d25c2b79ee97d, []int{4} } func (m *ChunkCheckpointModel) XXX_Unmarshal(b []byte) error { return m.Unmarshal(b) @@ -251,63 +254,66 @@ func init() { } func init() { - proto.RegisterFile("pkg/lightning/checkpoints/file_checkpoints.proto", fileDescriptor_5c42c6f4a8e50df4) + proto.RegisterFile("pkg/lightning/checkpoints/file_checkpoints.proto", fileDescriptor_192d25c2b79ee97d) } -var fileDescriptor_5c42c6f4a8e50df4 = []byte{ - // 823 bytes of a gzipped FileDescriptorProto - 0x1f, 0x8b, 0x08, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02, 0xff, 0x8c, 0x54, 0x4f, 0x6f, 0x23, 0x35, - 0x14, 0xef, 0x64, 0x9a, 0x34, 0x71, 0x92, 0x6e, 0x6a, 0xba, 0x8b, 0x29, 0x10, 0x85, 0x2c, 0x87, - 0x20, 0xd8, 0x54, 0x5a, 0x2e, 0xa8, 0x82, 0x03, 0x6d, 0x57, 0x62, 0x55, 0xad, 0xa8, 0xcc, 0xc2, - 0x81, 0xcb, 0xc8, 0x19, 0xbb, 0x99, 0xd1, 0xfc, 0xf1, 0xc8, 0xf6, 0x0c, 0x9b, 0xfd, 0x0e, 0x48, - 0x7c, 0x0c, 0xbe, 0x04, 0xf7, 0x15, 0xa7, 0x3d, 0x72, 0x84, 0xf6, 0xce, 0x67, 0x40, 0x7e, 0x9e, - 0x36, 0x93, 0x55, 0x54, 0x71, 0x7b, 0xef, 0xf7, 0x7e, 0xef, 0xe7, 0xf7, 0x9e, 0x9f, 0x8d, 0x4e, - 0x8a, 0x64, 0x79, 0x9c, 0xc6, 0xcb, 0xc8, 0xe4, 0x71, 0xde, 0xb4, 0xc2, 0x48, 0x84, 0x49, 0x21, - 0xe3, 0xdc, 0xe8, 0xe3, 0xab, 0x38, 0x15, 0x41, 0x03, 0x98, 0x17, 0x4a, 0x1a, 0x79, 0xf4, 0x64, - 0x19, 0x9b, 0xa8, 0x5c, 0xcc, 0x43, 0x99, 0x1d, 0x2f, 0xe5, 0x52, 0x1e, 0x03, 0xbc, 0x28, 0xaf, - 0xc0, 0x03, 0x07, 0x2c, 0x47, 0x9f, 0xfe, 0xeb, 0xa1, 0xd1, 0xd9, 0x5a, 0xe4, 0x85, 0xe4, 0x22, - 0xc5, 0xe7, 0xa8, 0xdf, 0x10, 0x26, 0xde, 0xc4, 0x9f, 0xf5, 0x9f, 0x4e, 0xe7, 0xef, 0xf2, 0x9a, - 0xc0, 0xb3, 0xdc, 0xa8, 0x15, 0x6d, 0xa6, 0xe1, 0x6f, 0xd0, 0x03, 0xc3, 
0x74, 0xd2, 0xa8, 0x91, - 0xb4, 0x26, 0xde, 0xac, 0xff, 0xf4, 0x70, 0xfe, 0x92, 0xe9, 0x64, 0x9d, 0x0c, 0x62, 0x74, 0xdf, - 0x6c, 0x80, 0x47, 0x3f, 0x6e, 0x14, 0x06, 0xfa, 0x78, 0x84, 0xfc, 0x44, 0xac, 0x88, 0x37, 0xf1, - 0x66, 0x3d, 0x6a, 0x4d, 0xfc, 0x39, 0x6a, 0x57, 0x2c, 0x2d, 0x45, 0x2d, 0xfd, 0x70, 0xfe, 0x92, - 0x2d, 0x52, 0xf1, 0xae, 0xb6, 0xe3, 0x9c, 0xb4, 0xbe, 0xf2, 0xa6, 0xbf, 0xb7, 0xd0, 0x7b, 0x5b, - 0x8e, 0xc7, 0xef, 0xa3, 0x3d, 0xa8, 0x36, 0xe6, 0x20, 0xef, 0xd3, 0x8e, 0x75, 0x9f, 0x73, 0xfc, - 0x31, 0x42, 0x5a, 0x96, 0x2a, 0x14, 0x01, 0x8f, 0x15, 0x1c, 0xd3, 0xa3, 0x3d, 0x87, 0x9c, 0xc7, - 0x0a, 0x13, 0xb4, 0xb7, 0x60, 0x61, 0x22, 0x72, 0x4e, 0x7c, 0x88, 0xdd, 0xba, 0xf8, 0x31, 0x1a, - 0xc6, 0x59, 0x21, 0x95, 0x11, 0x2a, 0x60, 0x9c, 0x2b, 0xb2, 0x0b, 0xf1, 0xc1, 0x2d, 0xf8, 0x2d, - 0xe7, 0x0a, 0x7f, 0x88, 0x7a, 0x26, 0xe6, 0x8b, 0x20, 0x92, 0xda, 0x90, 0x36, 0x10, 0xba, 0x16, - 0xf8, 0x4e, 0x6a, 0x73, 0x17, 0xb4, 0x7c, 0xd2, 0x99, 0x78, 0xb3, 0xb6, 0x0b, 0x5e, 0x4a, 0x65, - 0x6c, 0xc1, 0x05, 0x77, 0xc2, 0x7b, 0x90, 0xd7, 0x29, 0x38, 0x48, 0x4e, 0xd1, 0x50, 0xdb, 0x03, - 0x78, 0x90, 0x54, 0x50, 0x73, 0x17, 0xc2, 0x7d, 0x07, 0x5e, 0x54, 0xb6, 0xea, 0xc7, 0x68, 0x78, - 0xb7, 0x55, 0x41, 0x25, 0x14, 0xe9, 0xb9, 0xda, 0xee, 0xc0, 0x9f, 0x84, 0x9a, 0xfe, 0xda, 0x42, - 0x87, 0xdb, 0xc6, 0x89, 0x31, 0xda, 0x8d, 0x98, 0x8e, 0x60, 0x50, 0x03, 0x0a, 0x36, 0x7e, 0x84, - 0x3a, 0xda, 0x30, 0x53, 0x6a, 0x18, 0xc3, 0x90, 0xd6, 0x9e, 0x1d, 0x1f, 0x4b, 0x53, 0x19, 0x06, - 0x0b, 0xa6, 0x05, 0x8c, 0xc0, 0xa7, 0x3d, 0x40, 0x4e, 0x99, 0x16, 0xf8, 0x6b, 0xb4, 0x27, 0xf2, - 0x65, 0x9c, 0x0b, 0x4d, 0xba, 0xf5, 0x9a, 0x6d, 0x3b, 0x72, 0xfe, 0xcc, 0x91, 0xdc, 0x9a, 0xdd, - 0xa6, 0xd8, 0xe1, 0x1b, 0xcb, 0x7e, 0x7e, 0x0e, 0x0d, 0xf8, 0xf4, 0xd6, 0x3d, 0xa2, 0x68, 0xd0, - 0x4c, 0x69, 0x6e, 0xce, 0x81, 0xdb, 0x9c, 0x2f, 0x36, 0x37, 0xe7, 0x51, 0x7d, 0xc4, 0x3d, 0xab, - 0xf3, 0x87, 0x87, 0x1e, 0x6e, 0x25, 0x35, 0x9a, 0xf7, 0x36, 0x9a, 0x3f, 0x41, 0x9d, 0x30, 0x2a, - 0xf3, 0x44, 0x93, 0x56, 0xdd, 0xdc, 0xd6, 0xfc, 0xf9, 0x19, 0x90, 0x5c, 0x73, 0x75, 0xc6, 0xd1, - 0x25, 0xea, 0x37, 0xe0, 0xff, 0xb3, 0xfa, 0x40, 0xbf, 0xa7, 0xfe, 0x3f, 0x7d, 0x74, 0xb8, 0x8d, - 0x63, 0xef, 0xb3, 0x60, 0x26, 0xaa, 0xc5, 0xc1, 0xb6, 0x2d, 0xc9, 0xab, 0x2b, 0x2d, 0xdc, 0xa3, - 0xf5, 0x69, 0xed, 0xe1, 0x27, 0x08, 0x87, 0x32, 0x2d, 0xb3, 0x3c, 0x28, 0x84, 0xca, 0x4a, 0xc3, - 0x4c, 0x2c, 0x73, 0x32, 0x98, 0xf8, 0xb3, 0x36, 0x3d, 0x70, 0x91, 0xcb, 0x75, 0xc0, 0x5e, 0xbf, - 0xc8, 0x79, 0x50, 0x4b, 0xb5, 0xdd, 0xf5, 0x8b, 0x9c, 0x7f, 0xef, 0xd4, 0x46, 0xc8, 0x2f, 0xa4, - 0x86, 0xdd, 0xf6, 0xa9, 0x35, 0xf1, 0xa7, 0x68, 0xbf, 0x50, 0xa2, 0x0a, 0x94, 0xfc, 0x25, 0xe6, - 0x41, 0xc6, 0x5e, 0xc1, 0x76, 0xfb, 0x74, 0x60, 0x51, 0x6a, 0xc1, 0x17, 0xec, 0x95, 0x7d, 0x19, - 0x6b, 0x42, 0x17, 0x08, 0x5d, 0xd5, 0x08, 0x26, 0x55, 0x18, 0x2c, 0x56, 0x46, 0x68, 0xd8, 0x8b, - 0x5d, 0xda, 0x4d, 0xaa, 0xf0, 0xd4, 0xfa, 0xf6, 0xd9, 0xd8, 0x60, 0x52, 0x69, 0x82, 0x20, 0xd4, - 0x49, 0xaa, 0xf0, 0xa2, 0xd2, 0xf8, 0x13, 0x34, 0xb0, 0x01, 0xf8, 0xad, 0x74, 0x99, 0x91, 0xfe, - 0xc4, 0x9b, 0x75, 0x68, 0x3f, 0xa9, 0xc2, 0xb3, 0x1a, 0xc2, 0x1f, 0xd9, 0xf7, 0x98, 0x09, 0x6d, - 0x58, 0x56, 0x90, 0xe1, 0xc4, 0x9b, 0x8d, 0xe8, 0x1a, 0xb0, 0x53, 0x34, 0xab, 0x42, 0x90, 0x7d, - 0x78, 0xa8, 0x60, 0xe3, 0x09, 0xea, 0x87, 0x32, 0x2b, 0x94, 0xd0, 0xda, 0x8e, 0xe9, 0x01, 0x84, - 0x9a, 0x10, 0xfe, 0x00, 0x75, 0xed, 0xc3, 0x0c, 0xec, 0xe5, 0x8e, 0xdc, 0x07, 0x62, 0xfd, 0x0b, - 0xb1, 0xb2, 0x7d, 0xc0, 0x27, 0xaf, 0xe3, 0xd7, 0x82, 0x1c, 0xb8, 0x26, 0x2d, 0xf0, 0x43, 0xfc, - 
0x5a, 0x9c, 0x7e, 0xf6, 0xe6, 0x9f, 0xf1, 0xce, 0x9b, 0xeb, 0xb1, 0xf7, 0xf6, 0x7a, 0xec, 0xfd, - 0x7d, 0x3d, 0xf6, 0x7e, 0xbb, 0x19, 0xef, 0xbc, 0xbd, 0x19, 0xef, 0xfc, 0x75, 0x33, 0xde, 0xf9, - 0xb9, 0xf9, 0x11, 0x2f, 0x3a, 0xf0, 0xd5, 0x7f, 0xf9, 0x5f, 0x00, 0x00, 0x00, 0xff, 0xff, 0xc2, - 0xdb, 0x56, 0x3e, 0x57, 0x06, 0x00, 0x00, +var fileDescriptor_192d25c2b79ee97d = []byte{ + // 867 bytes of a gzipped FileDescriptorProto + 0x1f, 0x8b, 0x08, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02, 0xff, 0x8c, 0x55, 0xcf, 0x6e, 0xdb, 0xc6, + 0x13, 0x36, 0x45, 0xeb, 0xdf, 0x50, 0x72, 0xe4, 0xfd, 0xd9, 0xc9, 0xfe, 0xdc, 0x56, 0x55, 0x95, + 0x1e, 0x54, 0xb4, 0x91, 0x8a, 0xf4, 0x52, 0x04, 0xed, 0xa1, 0xb6, 0x03, 0x34, 0x30, 0x82, 0x1a, + 0xdb, 0xb4, 0x87, 0x5e, 0x08, 0x8a, 0x5c, 0x4b, 0xc4, 0x8a, 0x5c, 0x82, 0xbb, 0x64, 0xa3, 0x3c, + 0x45, 0x1f, 0xa3, 0x2f, 0xd1, 0x7b, 0xd0, 0x53, 0x8e, 0x3d, 0xa6, 0xf6, 0xbd, 0xcf, 0x50, 0xec, + 0x2c, 0x65, 0xd1, 0x81, 0x10, 0xf4, 0xb6, 0xf3, 0xcd, 0x37, 0xb3, 0xb3, 0x9f, 0xbe, 0xa1, 0xe0, + 0xcb, 0x4c, 0x2c, 0x66, 0xab, 0x78, 0xb1, 0xd4, 0x69, 0x9c, 0x2e, 0x66, 0xe1, 0x92, 0x87, 0x22, + 0x93, 0x71, 0xaa, 0xd5, 0xec, 0x2a, 0x5e, 0x71, 0xbf, 0x06, 0x4c, 0xb3, 0x5c, 0x6a, 0x79, 0xf2, + 0x68, 0x11, 0xeb, 0x65, 0x31, 0x9f, 0x86, 0x32, 0x99, 0x2d, 0xe4, 0x42, 0xce, 0x10, 0x9e, 0x17, + 0x57, 0x18, 0x61, 0x80, 0x27, 0x4b, 0x1f, 0xff, 0xe3, 0xc0, 0xe0, 0x6c, 0xdb, 0xe4, 0xb9, 0x8c, + 0xf8, 0x8a, 0x9c, 0x83, 0x57, 0x6b, 0x4c, 0x9d, 0x91, 0x3b, 0xf1, 0x1e, 0x8f, 0xa7, 0xef, 0xf2, + 0xea, 0xc0, 0xd3, 0x54, 0xe7, 0x6b, 0x56, 0x2f, 0x23, 0xdf, 0xc2, 0x3d, 0x1d, 0x28, 0x51, 0x9b, + 0x91, 0x36, 0x46, 0xce, 0xc4, 0x7b, 0x7c, 0x34, 0x7d, 0x11, 0x28, 0xb1, 0x2d, 0xc6, 0x66, 0xec, + 0x40, 0xdf, 0x01, 0x4f, 0x7e, 0xba, 0x33, 0x18, 0xf6, 0x27, 0x03, 0x70, 0x05, 0x5f, 0x53, 0x67, + 0xe4, 0x4c, 0xba, 0xcc, 0x1c, 0xc9, 0xe7, 0xd0, 0x2c, 0x83, 0x55, 0xc1, 0xab, 0xd6, 0xc7, 0xd3, + 0x17, 0xc1, 0x7c, 0xc5, 0xdf, 0xed, 0x6d, 0x39, 0x4f, 0x1a, 0x5f, 0x3b, 0xe3, 0xdf, 0x1b, 0xf0, + 0xbf, 0x1d, 0xd7, 0x93, 0x07, 0xd0, 0xc6, 0x69, 0xe3, 0x08, 0xdb, 0xbb, 0xac, 0x65, 0xc2, 0x67, + 0x11, 0xf9, 0x08, 0x40, 0xc9, 0x22, 0x0f, 0xb9, 0x1f, 0xc5, 0x39, 0x5e, 0xd3, 0x65, 0x5d, 0x8b, + 0x9c, 0xc7, 0x39, 0xa1, 0xd0, 0x9e, 0x07, 0xa1, 0xe0, 0x69, 0x44, 0x5d, 0xcc, 0x6d, 0x42, 0xf2, + 0x10, 0xfa, 0x71, 0x92, 0xc9, 0x5c, 0xf3, 0xdc, 0x0f, 0xa2, 0x28, 0xa7, 0xfb, 0x98, 0xef, 0x6d, + 0xc0, 0xef, 0xa2, 0x28, 0x27, 0x1f, 0x40, 0x57, 0xc7, 0xd1, 0xdc, 0x5f, 0x4a, 0xa5, 0x69, 0x13, + 0x09, 0x1d, 0x03, 0x7c, 0x2f, 0x95, 0xbe, 0x4d, 0x1a, 0x3e, 0x6d, 0x8d, 0x9c, 0x49, 0xd3, 0x26, + 0x2f, 0x65, 0xae, 0xcd, 0xc0, 0x59, 0x64, 0x1b, 0xb7, 0xb1, 0xae, 0x95, 0x45, 0xd8, 0x72, 0x0c, + 0x7d, 0x65, 0x2e, 0x88, 0x7c, 0x51, 0xe2, 0xcc, 0x1d, 0x4c, 0x7b, 0x16, 0xbc, 0x28, 0xcd, 0xd4, + 0x0f, 0xa1, 0x7f, 0xeb, 0x2a, 0xbf, 0xe4, 0x39, 0xed, 0xda, 0xd9, 0x6e, 0xc1, 0x9f, 0x79, 0x3e, + 0x7e, 0xdb, 0x80, 0xa3, 0x5d, 0x72, 0x12, 0x02, 0xfb, 0xcb, 0x40, 0x2d, 0x51, 0xa8, 0x1e, 0xc3, + 0x33, 0xb9, 0x0f, 0x2d, 0xa5, 0x03, 0x5d, 0x28, 0x94, 0xa1, 0xcf, 0xaa, 0xc8, 0xc8, 0x17, 0xac, + 0x56, 0x32, 0xf4, 0xe7, 0x81, 0xe2, 0x28, 0x81, 0xcb, 0xba, 0x88, 0x9c, 0x06, 0x8a, 0x93, 0x6f, + 0xa0, 0xcd, 0xd3, 0x45, 0x9c, 0x72, 0x45, 0x3b, 0x95, 0xcd, 0x76, 0x5d, 0x39, 0x7d, 0x6a, 0x49, + 0xd6, 0x66, 0x9b, 0x12, 0x23, 0xbe, 0x36, 0xec, 0x67, 0xe7, 0xf8, 0x00, 0x97, 0x6d, 0x42, 0xf2, + 0x7f, 0xe8, 0x88, 0xd2, 0x9f, 0xaf, 0x35, 0x57, 0x14, 0x46, 0xce, 0x64, 0x9f, 0xb5, 0x45, 0x79, + 0x6a, 0x42, 0x72, 0x0c, 0x2d, 0x51, 0xfa, 0xa2, 0x54, 0xd4, 0xc3, 0x44, 0x53, 0x94, 
0x17, 0xa5, + 0x22, 0x1f, 0x83, 0x27, 0x4a, 0x6b, 0x56, 0x55, 0x24, 0xb4, 0x37, 0x72, 0x26, 0x2d, 0x06, 0xa2, + 0x3c, 0xab, 0x90, 0x13, 0x06, 0xbd, 0xfa, 0x14, 0x75, 0x33, 0x1e, 0x5a, 0x33, 0x7e, 0x71, 0xd7, + 0x8c, 0xf7, 0xab, 0xa9, 0xdf, 0xe3, 0xc6, 0x3f, 0x1c, 0x38, 0xde, 0x49, 0xaa, 0xe9, 0xe9, 0xdc, + 0xd1, 0xf3, 0x09, 0xb4, 0xc2, 0x65, 0x91, 0x0a, 0x45, 0x1b, 0x95, 0x5e, 0x3b, 0xeb, 0xa7, 0x67, + 0x48, 0xb2, 0x7a, 0x55, 0x15, 0x27, 0x97, 0xe0, 0xd5, 0xe0, 0xff, 0xb2, 0x4d, 0x48, 0x7f, 0xcf, + 0xfc, 0x7f, 0xba, 0x70, 0xb4, 0x8b, 0x63, 0x2c, 0x92, 0x05, 0x7a, 0x59, 0x35, 0xc7, 0xb3, 0x79, + 0x92, 0xbc, 0xba, 0x52, 0xdc, 0x7e, 0x07, 0x5c, 0x56, 0x45, 0xe4, 0x11, 0x90, 0x50, 0xae, 0x8a, + 0x24, 0xf5, 0x33, 0x9e, 0x27, 0x85, 0x0e, 0x74, 0x2c, 0x53, 0xda, 0x1b, 0xb9, 0x93, 0x26, 0x3b, + 0xb4, 0x99, 0xcb, 0x6d, 0xc2, 0x38, 0x8a, 0xa7, 0x91, 0x5f, 0xb5, 0x6a, 0x5a, 0x47, 0xf1, 0x34, + 0xfa, 0xc1, 0x76, 0x1b, 0x80, 0x9b, 0x49, 0x85, 0xeb, 0xe2, 0x32, 0x73, 0x24, 0x9f, 0xc2, 0x41, + 0x96, 0xf3, 0xd2, 0xcf, 0xe5, 0xaf, 0x71, 0xe4, 0x27, 0xc1, 0x4b, 0x5c, 0x18, 0x97, 0xf5, 0x0c, + 0xca, 0x0c, 0xf8, 0x3c, 0x78, 0x69, 0x96, 0x6d, 0x4b, 0xe8, 0x20, 0xa1, 0x93, 0xd7, 0x92, 0xa2, + 0x0c, 0x2b, 0x3f, 0x75, 0xd1, 0x36, 0x1d, 0x51, 0x86, 0xd6, 0x50, 0x0f, 0xa0, 0x6d, 0x92, 0xc6, + 0x51, 0xd6, 0x6a, 0x2d, 0x51, 0x86, 0xc6, 0x52, 0x9f, 0x40, 0xcf, 0x24, 0x6e, 0x3d, 0xe5, 0xa1, + 0xa7, 0x3c, 0x51, 0x86, 0x1b, 0x53, 0x91, 0x0f, 0xcd, 0x8a, 0x27, 0x5c, 0xe9, 0x20, 0xc9, 0x68, + 0x7f, 0xe4, 0x4c, 0x06, 0x6c, 0x0b, 0x18, 0x15, 0xf5, 0x3a, 0xe3, 0xf4, 0x00, 0x77, 0x1f, 0xcf, + 0x64, 0x04, 0x5e, 0x28, 0x93, 0x2c, 0xe7, 0x4a, 0x19, 0x99, 0xee, 0x61, 0xaa, 0x0e, 0x19, 0xef, + 0x9b, 0x5d, 0xf7, 0xcd, 0x8f, 0x3b, 0xb0, 0xdf, 0x24, 0x13, 0x5f, 0xf0, 0xb5, 0x79, 0x07, 0xfe, + 0x6f, 0xa8, 0xf8, 0x15, 0xa7, 0x87, 0xf6, 0x91, 0x06, 0xf8, 0x31, 0x7e, 0xc5, 0x4f, 0x3f, 0x7b, + 0xfd, 0xf7, 0x70, 0xef, 0xf5, 0xf5, 0xd0, 0x79, 0x73, 0x3d, 0x74, 0xde, 0x5e, 0x0f, 0x9d, 0xdf, + 0x6e, 0x86, 0x7b, 0x6f, 0x6e, 0x86, 0x7b, 0x7f, 0xdd, 0x0c, 0xf7, 0x7e, 0xa9, 0x7f, 0xdb, 0xe7, + 0x2d, 0xfc, 0xf7, 0xf8, 0xea, 0xdf, 0x00, 0x00, 0x00, 0xff, 0xff, 0x5f, 0xdb, 0x31, 0x8d, 0xa0, + 0x06, 0x00, 0x00, } func (m *CheckpointsModel) Marshal() (dAtA []byte, err error) { @@ -473,6 +479,22 @@ func (m *TableCheckpointModel) MarshalToSizedBuffer(dAtA []byte) (int, error) { _ = i var l int _ = l + if m.KvChecksum != 0 { + i -= 8 + encoding_binary.LittleEndian.PutUint64(dAtA[i:], uint64(m.KvChecksum)) + i-- + dAtA[i] = 0x61 + } + if m.KvKvs != 0 { + i = encodeVarintFileCheckpoints(dAtA, i, uint64(m.KvKvs)) + i-- + dAtA[i] = 0x58 + } + if m.KvBytes != 0 { + i = encodeVarintFileCheckpoints(dAtA, i, uint64(m.KvBytes)) + i-- + dAtA[i] = 0x50 + } if m.TableID != 0 { i = encodeVarintFileCheckpoints(dAtA, i, uint64(m.TableID)) i-- @@ -810,6 +832,15 @@ func (m *TableCheckpointModel) Size() (n int) { if m.TableID != 0 { n += 1 + sovFileCheckpoints(uint64(m.TableID)) } + if m.KvBytes != 0 { + n += 1 + sovFileCheckpoints(uint64(m.KvBytes)) + } + if m.KvKvs != 0 { + n += 1 + sovFileCheckpoints(uint64(m.KvKvs)) + } + if m.KvChecksum != 0 { + n += 9 + } return n } @@ -1675,6 +1706,54 @@ func (m *TableCheckpointModel) Unmarshal(dAtA []byte) error { break } } + case 10: + if wireType != 0 { + return fmt.Errorf("proto: wrong wireType = %d for field KvBytes", wireType) + } + m.KvBytes = 0 + for shift := uint(0); ; shift += 7 { + if shift >= 64 { + return ErrIntOverflowFileCheckpoints + } + if iNdEx >= l { + return io.ErrUnexpectedEOF + } + b := dAtA[iNdEx] + iNdEx++ + 
m.KvBytes |= uint64(b&0x7F) << shift + if b < 0x80 { + break + } + } + case 11: + if wireType != 0 { + return fmt.Errorf("proto: wrong wireType = %d for field KvKvs", wireType) + } + m.KvKvs = 0 + for shift := uint(0); ; shift += 7 { + if shift >= 64 { + return ErrIntOverflowFileCheckpoints + } + if iNdEx >= l { + return io.ErrUnexpectedEOF + } + b := dAtA[iNdEx] + iNdEx++ + m.KvKvs |= uint64(b&0x7F) << shift + if b < 0x80 { + break + } + } + case 12: + if wireType != 1 { + return fmt.Errorf("proto: wrong wireType = %d for field KvChecksum", wireType) + } + m.KvChecksum = 0 + if (iNdEx + 8) > l { + return io.ErrUnexpectedEOF + } + m.KvChecksum = uint64(encoding_binary.LittleEndian.Uint64(dAtA[iNdEx:])) + iNdEx += 8 default: iNdEx = preIndex skippy, err := skipFileCheckpoints(dAtA[iNdEx:]) diff --git a/pkg/lightning/checkpoints/file_checkpoints.proto b/pkg/lightning/checkpoints/file_checkpoints.proto index 82166877f..32830a054 100644 --- a/pkg/lightning/checkpoints/file_checkpoints.proto +++ b/pkg/lightning/checkpoints/file_checkpoints.proto @@ -42,6 +42,9 @@ message TableCheckpointModel { int64 alloc_base = 4; map engines = 8; int64 tableID = 9; + uint64 kv_bytes = 10; + uint64 kv_kvs = 11; + fixed64 kv_checksum = 12; } message EngineCheckpointModel { diff --git a/pkg/lightning/mydump/region.go b/pkg/lightning/mydump/region.go index 9d7f1fb51..82033eb61 100644 --- a/pkg/lightning/mydump/region.go +++ b/pkg/lightning/mydump/region.go @@ -138,6 +138,7 @@ func MakeTableRegions( cfg *config.Config, ioWorkers *worker.Pool, store storage.ExternalStorage, + prevRowIDMax int64, ) ([]*TableRegion, error) { // Split files into regions type fileRegionRes struct { @@ -210,7 +211,6 @@ func MakeTableRegions( filesRegions := make([]*TableRegion, 0, len(meta.DataFiles)) dataFileSizes := make([]float64, 0, len(meta.DataFiles)) - prevRowIDMax := int64(0) for _, dataFile := range meta.DataFiles { fileRegionsRes := fileRegionsMap[dataFile.FileMeta.Path] var delta int64 diff --git a/pkg/lightning/mydump/region_test.go b/pkg/lightning/mydump/region_test.go index 4c0062721..660be8def 100644 --- a/pkg/lightning/mydump/region_test.go +++ b/pkg/lightning/mydump/region_test.go @@ -66,7 +66,7 @@ func (s *testMydumpRegionSuite) TestTableRegion(c *C) { ioWorkers := worker.NewPool(context.Background(), 1, "io") for _, meta := range dbMeta.Tables { - regions, err := MakeTableRegions(context.Background(), meta, 1, cfg, ioWorkers, loader.GetStore()) + regions, err := MakeTableRegions(context.Background(), meta, 1, cfg, ioWorkers, loader.GetStore(), 0) c.Assert(err, IsNil) // check - region-size vs file-size diff --git a/pkg/lightning/restore/restore.go b/pkg/lightning/restore/restore.go index 3cec18729..de4fc6223 100644 --- a/pkg/lightning/restore/restore.go +++ b/pkg/lightning/restore/restore.go @@ -1209,7 +1209,50 @@ func (t *TableRestore) restoreTable( zap.Int("filesCnt", cp.CountChunks()), ) } else if cp.Status < CheckpointStatusAllWritten { - if err := t.populateChunks(ctx, rc, cp); err != nil { + var maxRowID int64 + versionStr, err := rc.tidbGlue.GetSQLExecutor().ObtainStringWithLog( + ctx, "SELECT version()", "fetch tidb version", log.L()) + if err != nil { + return false, errors.Trace(err) + } + version, err := common.ExtractTiDBVersion(versionStr) + if err != nil { + return false, errors.Trace(err) + } + // "show table next_row_id" is only available after v4.0.0 + if version.Major >= 4 && rc.cfg.TikvImporter.Backend != config.BackendTiDB && + (common.TableHasAutoRowID(t.tableInfo.Core) || 
t.tableInfo.Core.GetAutoIncrementColInfo() != nil) { + // TODO: GetDB is not available in lightning in SQL + db, _ := rc.tidbGlue.GetDB() + autoIDInfos, err := kv.FetchTableAutoIDInfos(db, t.tableName) + if err != nil { + return false, errors.Trace(err) + } + if len(autoIDInfos) == 1 { + maxRowID = autoIDInfos[0].NextID - 1 + } else if len(autoIDInfos) == 0 { + return false, errors.New("can't fetch previous auto id base") + } else { + return false, errors.New("not supported: more than one auto id allocator found") + } + // maxRowID > 0 means table is likely contains data, so need to fetch current checksum value. + if maxRowID > 0 { + baseChecksum, err := DoChecksum(ctx, t.tableInfo) + if err != nil { + return false, errors.Trace(err) + } + cp.Checksum = verify.MakeKVChecksum(baseChecksum.TotalBytes, baseChecksum.TotalKVs, baseChecksum.Checksum) + rc.saveCpCh <- saveCp{ + tableName: t.tableName, + merger: &TableChecksumMerger{ + Checksum: cp.Checksum, + }, + } + t.logger.Info("checksum before restore table", zap.Object("checksum", &cp.Checksum)) + } + + } + if err := t.populateChunks(ctx, rc, cp, maxRowID); err != nil { return false, errors.Trace(err) } if err := rc.checkpointsDB.InsertEngineCheckpoints(ctx, t.tableName, cp.Engines); err != nil { @@ -1652,6 +1695,10 @@ func (t *TableRestore) postProcess( } } t.logger.Info("local checksum", zap.Object("checksum", &localChecksum)) + if cp.Checksum.SumKVS() > 0 { + localChecksum.Add(&cp.Checksum) + t.logger.Info("merged local checksum", zap.Object("checksum", &localChecksum)) + } err := t.compareChecksum(ctx, localChecksum) // with post restore level 'optional', we will skip checksum error if rc.cfg.PostRestore.Checksum == config.OpLevelOptional { @@ -2001,9 +2048,9 @@ func (tr *TableRestore) Close() { tr.logger.Info("restore done") } -func (t *TableRestore) populateChunks(ctx context.Context, rc *RestoreController, cp *TableCheckpoint) error { +func (t *TableRestore) populateChunks(ctx context.Context, rc *RestoreController, cp *TableCheckpoint, rowIDBase int64) error { task := t.logger.Begin(zap.InfoLevel, "load engines and files") - chunks, err := mydump.MakeTableRegions(ctx, t.tableMeta, len(t.tableInfo.Core.Columns), rc.cfg, rc.ioWorkers, rc.store) + chunks, err := mydump.MakeTableRegions(ctx, t.tableMeta, len(t.tableInfo.Core.Columns), rc.cfg, rc.ioWorkers, rc.store, rowIDBase) if err == nil { timestamp := time.Now().Unix() failpoint.Inject("PopulateChunkTimestamp", func(v failpoint.Value) { diff --git a/pkg/lightning/restore/restore_test.go b/pkg/lightning/restore/restore_test.go index c32760827..613dbb96c 100644 --- a/pkg/lightning/restore/restore_test.go +++ b/pkg/lightning/restore/restore_test.go @@ -298,7 +298,7 @@ func (s *tableRestoreSuite) TestPopulateChunks(c *C) { } rc := &RestoreController{cfg: s.cfg, ioWorkers: worker.NewPool(context.Background(), 1, "io"), store: s.store} - err := s.tr.populateChunks(context.Background(), rc, cp) + err := s.tr.populateChunks(context.Background(), rc, cp, 0) c.Assert(err, IsNil) c.Assert(cp.Engines, DeepEquals, map[int32]*EngineCheckpoint{ -1: { @@ -403,7 +403,7 @@ func (s *tableRestoreSuite) TestPopulateChunks(c *C) { s.cfg.Mydumper.StrictFormat = true regionSize := s.cfg.Mydumper.MaxRegionSize s.cfg.Mydumper.MaxRegionSize = 5 - err = s.tr.populateChunks(context.Background(), rc, cp) + err = s.tr.populateChunks(context.Background(), rc, cp, 0) c.Assert(err, NotNil) c.Assert(err, ErrorMatches, `.*unknown columns in header \[1 2 3\]`) s.cfg.Mydumper.MaxRegionSize = regionSize @@ -465,7 +465,7 
@@ func (s *tableRestoreSuite) TestPopulateChunksCSVHeader(c *C) { tr, err := NewTableRestore("`db`.`table`", tableMeta, s.dbInfo, s.tableInfo, &TableCheckpoint{}) c.Assert(err, IsNil) - c.Assert(tr.populateChunks(context.Background(), rc, cp), IsNil) + c.Assert(tr.populateChunks(context.Background(), rc, cp, 0), IsNil) c.Assert(cp.Engines, DeepEquals, map[int32]*EngineCheckpoint{ -1: { diff --git a/tests/lightning_checkpoint/run.sh b/tests/lightning_checkpoint/run.sh index c1e23a541..9943cf091 100755 --- a/tests/lightning_checkpoint/run.sh +++ b/tests/lightning_checkpoint/run.sh @@ -108,7 +108,7 @@ for BACKEND in importer tidb local; do run_lightning -d "$DBPATH" --backend $BACKEND --enable-checkpoint=1 run_sql "$PARTIAL_IMPORT_QUERY" check_contains "s: $(( (1000 * $CHUNK_COUNT + 1001) * $CHUNK_COUNT * $TABLE_COUNT ))" - run_sql 'SELECT count(*) FROM `tidb_lightning_checkpoint_test_cppk.1357924680.bak`.table_v6 WHERE status >= 200' + run_sql 'SELECT count(*) FROM `tidb_lightning_checkpoint_test_cppk.1357924680.bak`.table_v7 WHERE status >= 200' check_contains "count(*): $TABLE_COUNT" # Ensure there is no dangling open engines diff --git a/tests/lightning_checkpoint_chunks/run.sh b/tests/lightning_checkpoint_chunks/run.sh index f7b7cb92e..d06adfd9b 100755 --- a/tests/lightning_checkpoint_chunks/run.sh +++ b/tests/lightning_checkpoint_chunks/run.sh @@ -32,7 +32,7 @@ verify_checkpoint_noop() { run_sql 'SELECT count(i), sum(i) FROM cpch_tsr.tbl;' check_contains "count(i): $(($ROW_COUNT*$CHUNK_COUNT))" check_contains "sum(i): $(( $ROW_COUNT*$CHUNK_COUNT*(($CHUNK_COUNT+2)*$ROW_COUNT + 1)/2 ))" - run_sql 'SELECT count(*) FROM `tidb_lightning_checkpoint_test_cpch.1234567890.bak`.table_v6 WHERE status >= 200' + run_sql 'SELECT count(*) FROM `tidb_lightning_checkpoint_test_cpch.1234567890.bak`.table_v7 WHERE status >= 200' check_contains "count(*): 1" } From 5fdbb50d1681832fe642336d7ac92a1377aaab1d Mon Sep 17 00:00:00 2001 From: glorv Date: Wed, 3 Mar 2021 15:08:18 +0800 Subject: [PATCH 02/32] add integration test --- tests/lightning_incremental/config.toml | 0 .../data/incr-schema-create.sql | 1 + .../data/incr.auto_random-schema.sql | 5 ++ .../data/incr.auto_random.sql | 5 ++ .../data/incr.pk_auto_inc-schema.sql | 4 + .../data/incr.pk_auto_inc.sql | 5 ++ .../data/incr.rowid_uk_inc-schema.sql | 4 + .../data/incr.rowid_uk_inc.sql | 5 ++ .../data/incr.uk_auto_inc-schema.sql | 4 + .../data/incr.uk_auto_inc.sql | 5 ++ .../data1/incr-schema-create.sql | 1 + .../data1/incr.auto_random-schema.sql | 5 ++ .../data1/incr.auto_random.sql | 5 ++ .../data1/incr.pk_auto_inc-schema.sql | 4 + .../data1/incr.pk_auto_inc.sql | 5 ++ .../data1/incr.rowid_uk_inc-schema.sql | 4 + .../data1/incr.rowid_uk_inc.sql | 5 ++ .../data1/incr.uk_auto_inc-schema.sql | 4 + .../data1/incr.uk_auto_inc.sql | 5 ++ tests/lightning_incremental/run.sh | 81 +++++++++++++++++++ 20 files changed, 157 insertions(+) create mode 100644 tests/lightning_incremental/config.toml create mode 100644 tests/lightning_incremental/data/incr-schema-create.sql create mode 100644 tests/lightning_incremental/data/incr.auto_random-schema.sql create mode 100644 tests/lightning_incremental/data/incr.auto_random.sql create mode 100644 tests/lightning_incremental/data/incr.pk_auto_inc-schema.sql create mode 100644 tests/lightning_incremental/data/incr.pk_auto_inc.sql create mode 100644 tests/lightning_incremental/data/incr.rowid_uk_inc-schema.sql create mode 100644 tests/lightning_incremental/data/incr.rowid_uk_inc.sql create mode 100644 
tests/lightning_incremental/data/incr.uk_auto_inc-schema.sql create mode 100644 tests/lightning_incremental/data/incr.uk_auto_inc.sql create mode 100644 tests/lightning_incremental/data1/incr-schema-create.sql create mode 100644 tests/lightning_incremental/data1/incr.auto_random-schema.sql create mode 100644 tests/lightning_incremental/data1/incr.auto_random.sql create mode 100644 tests/lightning_incremental/data1/incr.pk_auto_inc-schema.sql create mode 100644 tests/lightning_incremental/data1/incr.pk_auto_inc.sql create mode 100644 tests/lightning_incremental/data1/incr.rowid_uk_inc-schema.sql create mode 100644 tests/lightning_incremental/data1/incr.rowid_uk_inc.sql create mode 100644 tests/lightning_incremental/data1/incr.uk_auto_inc-schema.sql create mode 100644 tests/lightning_incremental/data1/incr.uk_auto_inc.sql create mode 100644 tests/lightning_incremental/run.sh diff --git a/tests/lightning_incremental/config.toml b/tests/lightning_incremental/config.toml new file mode 100644 index 000000000..e69de29bb diff --git a/tests/lightning_incremental/data/incr-schema-create.sql b/tests/lightning_incremental/data/incr-schema-create.sql new file mode 100644 index 000000000..624892540 --- /dev/null +++ b/tests/lightning_incremental/data/incr-schema-create.sql @@ -0,0 +1 @@ +create database `incr`; diff --git a/tests/lightning_incremental/data/incr.auto_random-schema.sql b/tests/lightning_incremental/data/incr.auto_random-schema.sql new file mode 100644 index 000000000..712c45921 --- /dev/null +++ b/tests/lightning_incremental/data/incr.auto_random-schema.sql @@ -0,0 +1,5 @@ +/*!40103 SET TIME_ZONE='+00:00' */; +CREATE TABLE `auto_random` ( + `id` bigint primary key auto_random, + v varchar(255) +) ENGINE=InnoDB DEFAULT CHARSET=utf8 COLLATE=utf8_bin; diff --git a/tests/lightning_incremental/data/incr.auto_random.sql b/tests/lightning_incremental/data/incr.auto_random.sql new file mode 100644 index 000000000..d4357822b --- /dev/null +++ b/tests/lightning_incremental/data/incr.auto_random.sql @@ -0,0 +1,5 @@ +/*!40103 SET TIME_ZONE='+00:00' */; +INSERT INTO `auto_random` (`v`) VALUES +("a"), +("b"), +("c"); diff --git a/tests/lightning_incremental/data/incr.pk_auto_inc-schema.sql b/tests/lightning_incremental/data/incr.pk_auto_inc-schema.sql new file mode 100644 index 000000000..52e876978 --- /dev/null +++ b/tests/lightning_incremental/data/incr.pk_auto_inc-schema.sql @@ -0,0 +1,4 @@ +CREATE TABLE `auto_random` ( + `id` bigint PRIMARY KEY AUTO_INCREMENT, + v varchar(255) +) ENGINE=InnoDB DEFAULT CHARSET=utf8 COLLATE=utf8_bin; diff --git a/tests/lightning_incremental/data/incr.pk_auto_inc.sql b/tests/lightning_incremental/data/incr.pk_auto_inc.sql new file mode 100644 index 000000000..ac85444a5 --- /dev/null +++ b/tests/lightning_incremental/data/incr.pk_auto_inc.sql @@ -0,0 +1,5 @@ +/*!40103 SET TIME_ZONE='+00:00' */; +INSERT INTO `pk_auto_inc` (`v`) VALUES +("a"), +("b"), +("c"); diff --git a/tests/lightning_incremental/data/incr.rowid_uk_inc-schema.sql b/tests/lightning_incremental/data/incr.rowid_uk_inc-schema.sql new file mode 100644 index 000000000..5beb69550 --- /dev/null +++ b/tests/lightning_incremental/data/incr.rowid_uk_inc-schema.sql @@ -0,0 +1,4 @@ +CREATE TABLE `rowid_uk_inc` ( + `id` bigint UNIQUE KEY AUTO_INCREMENT, + s varchar(16), +) ENGINE=InnoDB DEFAULT CHARSET=utf8 COLLATE=utf8_bin; diff --git a/tests/lightning_incremental/data/incr.rowid_uk_inc.sql b/tests/lightning_incremental/data/incr.rowid_uk_inc.sql new file mode 100644 index 000000000..91831a9ff --- /dev/null +++ 
b/tests/lightning_incremental/data/incr.rowid_uk_inc.sql @@ -0,0 +1,5 @@ +/*!40103 SET TIME_ZONE='+00:00' */; +INSERT INTO `rowid_uk_inc` (`id`) VALUES +('a'), +('b'), +('c'); diff --git a/tests/lightning_incremental/data/incr.uk_auto_inc-schema.sql b/tests/lightning_incremental/data/incr.uk_auto_inc-schema.sql new file mode 100644 index 000000000..d38886b60 --- /dev/null +++ b/tests/lightning_incremental/data/incr.uk_auto_inc-schema.sql @@ -0,0 +1,4 @@ +CREATE TABLE `uk_auto_inc` ( + `u` int PRIMARY KEY, + `id` bigint UNIQUE KEY AUTO_INCREMENT, +) ENGINE=InnoDB DEFAULT CHARSET=utf8 COLLATE=utf8_bin; diff --git a/tests/lightning_incremental/data/incr.uk_auto_inc.sql b/tests/lightning_incremental/data/incr.uk_auto_inc.sql new file mode 100644 index 000000000..3384ed652 --- /dev/null +++ b/tests/lightning_incremental/data/incr.uk_auto_inc.sql @@ -0,0 +1,5 @@ +/*!40103 SET TIME_ZONE='+00:00' */; +INSERT INTO `uk_auto_inc` (`u`) VALUES +(1), +(2), +(3); diff --git a/tests/lightning_incremental/data1/incr-schema-create.sql b/tests/lightning_incremental/data1/incr-schema-create.sql new file mode 100644 index 000000000..624892540 --- /dev/null +++ b/tests/lightning_incremental/data1/incr-schema-create.sql @@ -0,0 +1 @@ +create database `incr`; diff --git a/tests/lightning_incremental/data1/incr.auto_random-schema.sql b/tests/lightning_incremental/data1/incr.auto_random-schema.sql new file mode 100644 index 000000000..712c45921 --- /dev/null +++ b/tests/lightning_incremental/data1/incr.auto_random-schema.sql @@ -0,0 +1,5 @@ +/*!40103 SET TIME_ZONE='+00:00' */; +CREATE TABLE `auto_random` ( + `id` bigint primary key auto_random, + v varchar(255) +) ENGINE=InnoDB DEFAULT CHARSET=utf8 COLLATE=utf8_bin; diff --git a/tests/lightning_incremental/data1/incr.auto_random.sql b/tests/lightning_incremental/data1/incr.auto_random.sql new file mode 100644 index 000000000..7e89d09b5 --- /dev/null +++ b/tests/lightning_incremental/data1/incr.auto_random.sql @@ -0,0 +1,5 @@ +/*!40103 SET TIME_ZONE='+00:00' */; +INSERT INTO `auto_random` (`v`) VALUES +("d"), +("e"), +("f"); diff --git a/tests/lightning_incremental/data1/incr.pk_auto_inc-schema.sql b/tests/lightning_incremental/data1/incr.pk_auto_inc-schema.sql new file mode 100644 index 000000000..52e876978 --- /dev/null +++ b/tests/lightning_incremental/data1/incr.pk_auto_inc-schema.sql @@ -0,0 +1,4 @@ +CREATE TABLE `auto_random` ( + `id` bigint PRIMARY KEY AUTO_INCREMENT, + v varchar(255) +) ENGINE=InnoDB DEFAULT CHARSET=utf8 COLLATE=utf8_bin; diff --git a/tests/lightning_incremental/data1/incr.pk_auto_inc.sql b/tests/lightning_incremental/data1/incr.pk_auto_inc.sql new file mode 100644 index 000000000..5a0ab087d --- /dev/null +++ b/tests/lightning_incremental/data1/incr.pk_auto_inc.sql @@ -0,0 +1,5 @@ +/*!40103 SET TIME_ZONE='+00:00' */; +INSERT INTO `pk_auto_inc` (`v`) VALUES +("d"), +("e"), +("f"); diff --git a/tests/lightning_incremental/data1/incr.rowid_uk_inc-schema.sql b/tests/lightning_incremental/data1/incr.rowid_uk_inc-schema.sql new file mode 100644 index 000000000..cf5bb71fc --- /dev/null +++ b/tests/lightning_incremental/data1/incr.rowid_uk_inc-schema.sql @@ -0,0 +1,4 @@ +CREATE TABLE `uk_auto_inc` ( + `id` bigint UNIQUE KEY AUTO_INCREMENT, + s varchar(16), +) ENGINE=InnoDB DEFAULT CHARSET=utf8 COLLATE=utf8_bin; diff --git a/tests/lightning_incremental/data1/incr.rowid_uk_inc.sql b/tests/lightning_incremental/data1/incr.rowid_uk_inc.sql new file mode 100644 index 000000000..5938fc484 --- /dev/null +++ 
b/tests/lightning_incremental/data1/incr.rowid_uk_inc.sql @@ -0,0 +1,5 @@ +/*!40103 SET TIME_ZONE='+00:00' */; +INSERT INTO `rowid_uk_inc` (`s`) VALUES +("d"), +("e"), +("f"); diff --git a/tests/lightning_incremental/data1/incr.uk_auto_inc-schema.sql b/tests/lightning_incremental/data1/incr.uk_auto_inc-schema.sql new file mode 100644 index 000000000..d38886b60 --- /dev/null +++ b/tests/lightning_incremental/data1/incr.uk_auto_inc-schema.sql @@ -0,0 +1,4 @@ +CREATE TABLE `uk_auto_inc` ( + `u` int PRIMARY KEY, + `id` bigint UNIQUE KEY AUTO_INCREMENT, +) ENGINE=InnoDB DEFAULT CHARSET=utf8 COLLATE=utf8_bin; diff --git a/tests/lightning_incremental/data1/incr.uk_auto_inc.sql b/tests/lightning_incremental/data1/incr.uk_auto_inc.sql new file mode 100644 index 000000000..ece660062 --- /dev/null +++ b/tests/lightning_incremental/data1/incr.uk_auto_inc.sql @@ -0,0 +1,5 @@ +/*!40103 SET TIME_ZONE='+00:00' */; +INSERT INTO `pk_auto_inc` (`id`) VALUES +(4), +(5), +(6); diff --git a/tests/lightning_incremental/run.sh b/tests/lightning_incremental/run.sh new file mode 100644 index 000000000..30c1ab58a --- /dev/null +++ b/tests/lightning_incremental/run.sh @@ -0,0 +1,81 @@ +#!/bin/sh +# +# Copyright 2020 PingCAP, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# See the License for the specific language governing permissions and +# limitations under the License. + +set -eu + +# FIXME: auto-random is only stable on master currently. +check_cluster_version 4 0 0 AUTO_RANDOM || exit 0 + +DB_NAME=incr + +for backend in importer local; do + if [ "$backend" = 'local' ]; then + check_cluster_version 4 0 0 'local backend' || continue + fi + + run_sql "DROP DATABASE IF EXISTS incr;" + run_lightning --backend $backend + + for tbl in auto_random pk_auto_inc rowid_uk_inc uk_auto_inc; do + run_sql "SELECT count(*) from incr.$tbl" + check_contains "count(*): 3" + done + + for tbl in auto_random pk_auto_inc rowid_uk_inc uk_auto_inc; do + if [ "$tbl" = "auto_random" ]; then + run_sql "SELECT id & b'000001111111111111111111111111111111111111111111111111111111111' as inc FROM incr.$tbl" + else + run_sql "SELECT id as inc FROM incr.$tbl" + fi + check_contains 'inc: 1' + check_contains 'inc: 2' + check_contains 'inc: 3' + done + + for tbl in pk_auto_inc rowid_uk_inc; do + run_sql "SELECT group_concat(v) from incr.$tbl group by 'all';" + check_contains "group_concat(v): a,b,c" + done + + run_sql "SELECT sum(u) from incr.uk_auto_inc;" + check_contains "sum(u): 6" + + # incrementally import all data in data1 + run_lightning --backend $backend -d "tests/$TEST_NAME/data1" + + for tbl in auto_random pk_auto_inc rowid_uk_inc uk_auto_inc; do + run_sql "SELECT count(*) from incr.$tbl" + check_contains "count(*): 6" + done + + for tbl in auto_random pk_auto_inc rowid_uk_inc uk_auto_inc; do + if [ "$tbl" = "auto_random" ]; then + run_sql "SELECT id & b'000001111111111111111111111111111111111111111111111111111111111' as inc FROM incr.$tbl" + else + run_sql "SELECT id as inc FROM incr.$tbl" + fi + check_contains 'inc: 4' + check_contains 'inc: 5' + check_contains 'inc: 6' + done + + for tbl in pk_auto_inc rowid_uk_inc; do + run_sql "SELECT group_concat(v) from incr.$tbl group by 'all';" + check_contains 
"group_concat(v): a,b,c,d,e,f" + done + + run_sql "SELECT sum(u) from incr.uk_auto_inc;" + check_contains "sum(u): 21" +done From df174b6b422abda73284612b944f15419b4cb55e Mon Sep 17 00:00:00 2001 From: glorv Date: Tue, 9 Mar 2021 11:02:25 +0800 Subject: [PATCH 03/32] allow run multi lightning in parallel --- pkg/lightning/backend/local.go | 1 - pkg/lightning/backend/tidb.go | 11 +- pkg/lightning/glue/glue.go | 9 + pkg/lightning/mydump/region.go | 2 +- pkg/lightning/mydump/region_test.go | 2 +- pkg/lightning/restore/restore.go | 426 ++++++++++++++++++++++++-- pkg/lightning/restore/restore_test.go | 6 +- 7 files changed, 414 insertions(+), 43 deletions(-) diff --git a/pkg/lightning/backend/local.go b/pkg/lightning/backend/local.go index 84a2d3872..b72013ac7 100644 --- a/pkg/lightning/backend/local.go +++ b/pkg/lightning/backend/local.go @@ -1878,7 +1878,6 @@ func (w *LocalWriter) writeKVsOrIngest(desc localIngestDescription) error { return err } } - // if write failed only because of unorderedness, we immediately ingest the memcache. immWriter, err := newSSTWriter(w.genSSTPath()) if err != nil { diff --git a/pkg/lightning/backend/tidb.go b/pkg/lightning/backend/tidb.go index b575a9c95..e95602684 100644 --- a/pkg/lightning/backend/tidb.go +++ b/pkg/lightning/backend/tidb.go @@ -34,6 +34,7 @@ import ( "github.com/pingcap/br/pkg/lightning/common" "github.com/pingcap/br/pkg/lightning/config" + "github.com/pingcap/br/pkg/lightning/glue" "github.com/pingcap/br/pkg/lightning/log" "github.com/pingcap/br/pkg/lightning/verification" ) @@ -494,7 +495,7 @@ func (be *tidbBackend) FetchRemoteTableModels(ctx context.Context, schemaName st // init auto id column for each table for _, tbl := range tables { tblName := common.UniqueTable(schemaName, tbl.Name.O) - autoIDInfos, err := FetchTableAutoIDInfos(tx, tblName) + autoIDInfos, err := FetchTableAutoIDInfos(ctx, tx, tblName) if err != nil { return errors.Trace(err) } @@ -552,18 +553,14 @@ func (w *TiDBWriter) AppendRows(ctx context.Context, tableName string, columnNam return w.be.WriteRows(ctx, w.engineUUID, tableName, columnNames, arg1, rows) } -type QueryExecutor interface { - Query(query string, args ...interface{}) (*sql.Rows, error) -} - type TableAutoIDInfo struct { Column string NextID int64 Type string } -func FetchTableAutoIDInfos(exec QueryExecutor, tableName string) ([]*TableAutoIDInfo, error) { - rows, e := exec.Query(fmt.Sprintf("SHOW TABLE %s NEXT_ROW_ID", tableName)) +func FetchTableAutoIDInfos(ctx context.Context, exec glue.QueryExecutor, tableName string) ([]*TableAutoIDInfo, error) { + rows, e := exec.QueryContext(ctx, fmt.Sprintf("SHOW TABLE %s NEXT_ROW_ID", tableName)) if e != nil { return nil, errors.Trace(e) } diff --git a/pkg/lightning/glue/glue.go b/pkg/lightning/glue/glue.go index cf3a17f5d..592d0c8a7 100644 --- a/pkg/lightning/glue/glue.go +++ b/pkg/lightning/glue/glue.go @@ -189,3 +189,12 @@ const ( RecordEstimatedChunk = "EstimatedChunk" RecordFinishedChunk = "FinishedChunk" ) + +type QueryExecutor interface { + QueryContext(ctx context.Context, query string, args ...interface{}) (*sql.Rows, error) +} + +type DBExecutor interface { + QueryExecutor + BeginTx(ctx context.Context, opts *sql.TxOptions) (*sql.Tx, error) +} diff --git a/pkg/lightning/mydump/region.go b/pkg/lightning/mydump/region.go index 82033eb61..9d7f1fb51 100644 --- a/pkg/lightning/mydump/region.go +++ b/pkg/lightning/mydump/region.go @@ -138,7 +138,6 @@ func MakeTableRegions( cfg *config.Config, ioWorkers *worker.Pool, store storage.ExternalStorage, - prevRowIDMax 
int64, ) ([]*TableRegion, error) { // Split files into regions type fileRegionRes struct { @@ -211,6 +210,7 @@ func MakeTableRegions( filesRegions := make([]*TableRegion, 0, len(meta.DataFiles)) dataFileSizes := make([]float64, 0, len(meta.DataFiles)) + prevRowIDMax := int64(0) for _, dataFile := range meta.DataFiles { fileRegionsRes := fileRegionsMap[dataFile.FileMeta.Path] var delta int64 diff --git a/pkg/lightning/mydump/region_test.go b/pkg/lightning/mydump/region_test.go index 660be8def..4c0062721 100644 --- a/pkg/lightning/mydump/region_test.go +++ b/pkg/lightning/mydump/region_test.go @@ -66,7 +66,7 @@ func (s *testMydumpRegionSuite) TestTableRegion(c *C) { ioWorkers := worker.NewPool(context.Background(), 1, "io") for _, meta := range dbMeta.Tables { - regions, err := MakeTableRegions(context.Background(), meta, 1, cfg, ioWorkers, loader.GetStore(), 0) + regions, err := MakeTableRegions(context.Background(), meta, 1, cfg, ioWorkers, loader.GetStore()) c.Assert(err, IsNil) // check - region-size vs file-size diff --git a/pkg/lightning/restore/restore.go b/pkg/lightning/restore/restore.go index de4fc6223..0b495dd80 100644 --- a/pkg/lightning/restore/restore.go +++ b/pkg/lightning/restore/restore.go @@ -15,6 +15,7 @@ package restore import ( "context" + "database/sql" "fmt" "io" "math" @@ -70,6 +71,25 @@ const ( compactStateDoing ) +const ( + // CreateBRIESubJobTable stores the per-table sub jobs information used by TiDB Lightning + CreateBRIESubJobTable = `CREATE TABLE IF NOT EXISTS mysql.brie_sub_tasks ( + id BIGINT(20) UNSIGNED, + table_id BIGINT(64) NOT NULL, + table_name VARCHAR(64) NOT NULL, + row_id_base BIGINT(20) NOT NULL DEFAULT 0, + row_id_max BIGINT(20) NOT NULL DEFAULT 0, + base_total_kvs BIGINT(20) UNSIGNED NOT NULL DEFAULT 0, + base_total_bytes BIGINT(20) UNSIGNED NOT NULL DEFAULT 0, + base_checksum BIGINT(20) UNSIGNED NOT NULL DEFAULT 0, + total_kvs BIGINT(20) UNSIGNED NOT NULL DEFAULT 0, + total_bytes BIGINT(20) UNSIGNED NOT NULL DEFAULT 0, + checksum BIGINT(20) UNSIGNED NOT NULL DEFAULT 0, + status VARCHAR(32) NOT NULL, + PRIMARY KEY (table_id, id) + );` +) + // DeliverPauser is a shared pauser to pause progress to (*chunkRestore).encodeLoop var DeliverPauser = common.NewPauser() @@ -550,6 +570,19 @@ func (worker *restoreSchemaWorker) appendJob(job *schemaJob) error { } func (rc *RestoreController) restoreSchema(ctx context.Context) error { + executor := rc.tidbGlue.GetSQLExecutor() + // set pessimistic transation mode + if err := executor.ExecuteWithLog(ctx, "SET GLOBAL tidb_txn_mode = 'pessimistic';", "switch to pessimistic mode", + log.L()); err != nil { + return errors.Annotate(err, "switch txn mode failed") + } + + // TODO: maybe we should not create this table here since user may not have write permission to the `mysql` db. 
+ // ensure meta table exists + if err := executor.ExecuteWithLog(ctx, CreateBRIESubJobTable, "create meta table", log.L()); err != nil { + return errors.Annotate(err, "create meta table failed") + } + if !rc.cfg.Mydumper.NoSchema { logTask := log.L().Begin(zap.InfoLevel, "restore all schema") concurrency := utils.MinInt(rc.cfg.App.RegionConcurrency, 8) @@ -1175,8 +1208,23 @@ func (rc *RestoreController) restoreTables(ctx context.Context) error { wg.Add(1) go func() { for task := range postProcessTaskChan { + // TODO: support Lightning via SQL + db, err := rc.tidbGlue.GetDB() + if err != nil { + restoreErr.Set(err) + continue + } + + metaMgr := &tableMetaMgr{ + session: common.SQLWithRetry{ + DB: db, + Logger: task.tr.logger, + }, + taskID: rc.cfg.TaskID, + tr: task.tr, + } // force all the remain post-process tasks to be executed - _, err := task.tr.postProcess(ctx2, rc, task.cp, true) + _, err = task.tr.postProcess(ctx2, rc, task.cp, true, metaMgr) restoreErr.Set(err) } wg.Done() @@ -1202,6 +1250,21 @@ func (t *TableRestore) restoreTable( default: } + // TODO: support Lightning via SQL + db, err := rc.tidbGlue.GetDB() + if err != nil { + return false, errors.Trace(err) + } + + metaMgr := &tableMetaMgr{ + session: common.SQLWithRetry{ + DB: db, + Logger: t.logger, + }, + taskID: rc.cfg.TaskID, + tr: t, + } + // no need to do anything if the chunks are already populated if len(cp.Engines) > 0 { t.logger.Info("reusing engines and files info from checkpoint", @@ -1209,39 +1272,45 @@ func (t *TableRestore) restoreTable( zap.Int("filesCnt", cp.CountChunks()), ) } else if cp.Status < CheckpointStatusAllWritten { - var maxRowID int64 versionStr, err := rc.tidbGlue.GetSQLExecutor().ObtainStringWithLog( ctx, "SELECT version()", "fetch tidb version", log.L()) if err != nil { return false, errors.Trace(err) } + version, err := common.ExtractTiDBVersion(versionStr) if err != nil { return false, errors.Trace(err) } + + if err := t.populateChunks(ctx, rc, cp); err != nil { + return false, errors.Trace(err) + } + + // fetch the max chunk row_id max value as the global max row_id + rowIDMax := int64(0) + for _, engine := range cp.Engines { + if len(engine.Chunks) > 0 && engine.Chunks[len(engine.Chunks)-1].Chunk.RowIDMax > rowIDMax { + rowIDMax = engine.Chunks[len(engine.Chunks)-1].Chunk.RowIDMax + } + } + // "show table next_row_id" is only available after v4.0.0 if version.Major >= 4 && rc.cfg.TikvImporter.Backend != config.BackendTiDB && (common.TableHasAutoRowID(t.tableInfo.Core) || t.tableInfo.Core.GetAutoIncrementColInfo() != nil) { - // TODO: GetDB is not available in lightning in SQL - db, _ := rc.tidbGlue.GetDB() - autoIDInfos, err := kv.FetchTableAutoIDInfos(db, t.tableName) - if err != nil { - return false, errors.Trace(err) + // first, insert a new-line into meta table + if err = metaMgr.InitTableMeta(ctx); err != nil { + return false, err } - if len(autoIDInfos) == 1 { - maxRowID = autoIDInfos[0].NextID - 1 - } else if len(autoIDInfos) == 0 { - return false, errors.New("can't fetch previous auto id base") - } else { - return false, errors.New("not supported: more than one auto id allocator found") + + checksum, rowIDBase, err := metaMgr.AllocTableRowIDs(ctx, t, rowIDMax) + if err != nil { + return false, err } - // maxRowID > 0 means table is likely contains data, so need to fetch current checksum value. 
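
In the new flow the chunks are populated starting from row id 0 (populateChunks no longer takes a rowIDBase argument) and are shifted afterwards by the base returned from the meta table via RebaseChunkRowIDs, while the base checksum shown in the removed block below is taken over by AllocTableRowIDs. A small self-contained illustration of that shift, with made-up chunk values:

// chunkRowIDs is a stand-in for the row-id range stored in a chunk checkpoint.
type chunkRowIDs struct{ PrevRowIDMax, RowIDMax int64 }

// rebaseChunks mirrors RebaseChunkRowIDs: every chunk's range moves up by the
// base this task was allocated, so rows from different Lightning instances
// (and rows already in the table) never collide.
func rebaseChunks(chunks []chunkRowIDs, rowIDBase int64) {
	if rowIDBase == 0 {
		return // nothing allocated before us, keep the ranges as populated
	}
	for i := range chunks {
		chunks[i].PrevRowIDMax += rowIDBase
		chunks[i].RowIDMax += rowIDBase
	}
}

// Example: chunks populated as (0,50] and (50,80] become (100,150] and
// (150,180] when rowIDBase is 100, i.e. they start right after the row ids
// already accounted for in the target table.
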
- if maxRowID > 0 { - baseChecksum, err := DoChecksum(ctx, t.tableInfo) - if err != nil { - return false, errors.Trace(err) - } - cp.Checksum = verify.MakeKVChecksum(baseChecksum.TotalBytes, baseChecksum.TotalKVs, baseChecksum.Checksum) + t.RebaseChunkRowIDs(cp, rowIDBase) + + if checksum != nil { + cp.Checksum = *checksum rc.saveCpCh <- saveCp{ tableName: t.tableName, merger: &TableChecksumMerger{ @@ -1250,10 +1319,6 @@ func (t *TableRestore) restoreTable( } t.logger.Info("checksum before restore table", zap.Object("checksum", &cp.Checksum)) } - - } - if err := t.populateChunks(ctx, rc, cp, maxRowID); err != nil { - return false, errors.Trace(err) } if err := rc.checkpointsDB.InsertEngineCheckpoints(ctx, t.tableName, cp.Engines); err != nil { return false, errors.Trace(err) @@ -1277,13 +1342,18 @@ func (t *TableRestore) restoreTable( } // 2. Restore engines (if still needed) - err := t.restoreEngines(ctx, rc, cp) + err = t.restoreEngines(ctx, rc, cp) + if err != nil { + return false, errors.Trace(err) + } + + err = metaMgr.updateTableStatus(ctx, metaStatusRestoreFinished) if err != nil { return false, errors.Trace(err) } // 3. Post-process. With the last parameter set to false, we can allow delay analyze execute latter - return t.postProcess(ctx, rc, cp, false /* force-analyze */) + return t.postProcess(ctx, rc, cp, false /* force-analyze */, metaMgr) } func (t *TableRestore) restoreEngines(ctx context.Context, rc *RestoreController, cp *TableCheckpoint) error { @@ -1643,6 +1713,7 @@ func (t *TableRestore) postProcess( rc *RestoreController, cp *TableCheckpoint, forcePostProcess bool, + metaMgr *tableMetaMgr, ) (bool, error) { // there are no data in this table, no need to do post process // this is important for tables that are just the dump table of views @@ -1695,11 +1766,23 @@ func (t *TableRestore) postProcess( } } t.logger.Info("local checksum", zap.Object("checksum", &localChecksum)) - if cp.Checksum.SumKVS() > 0 { + + needChecksum, baseTotalChecksum, err := metaMgr.checkAndUpdateLocalChecksum(ctx, &localChecksum) + if err != nil { + return false, err + } + + if !needChecksum { + return false, nil + } + + if cp.Checksum.SumKVS() > 0 || baseTotalChecksum.SumKVS() > 0 { localChecksum.Add(&cp.Checksum) + localChecksum.Add(baseTotalChecksum) t.logger.Info("merged local checksum", zap.Object("checksum", &localChecksum)) } - err := t.compareChecksum(ctx, localChecksum) + + err = t.compareChecksum(ctx, localChecksum) // with post restore level 'optional', we will skip checksum error if rc.cfg.PostRestore.Checksum == config.OpLevelOptional { if err != nil { @@ -1707,10 +1790,15 @@ func (t *TableRestore) postProcess( err = nil } } + if err == nil { + err = metaMgr.FinishTable(ctx) + } + rc.saveStatusCheckpoint(t.tableName, WholeTableEngineID, err, CheckpointStatusChecksummed) if err != nil { return false, errors.Trace(err) } + cp.Status = CheckpointStatusChecksummed } else { finished = false @@ -1910,6 +1998,7 @@ func (rc *RestoreController) setGlobalVariables(ctx context.Context) error { // we should enable/disable new collation here since in server mode, tidb config // may be different in different tasks collate.SetNewCollationEnabledForTest(enabled) + return nil } @@ -2048,9 +2137,9 @@ func (tr *TableRestore) Close() { tr.logger.Info("restore done") } -func (t *TableRestore) populateChunks(ctx context.Context, rc *RestoreController, cp *TableCheckpoint, rowIDBase int64) error { +func (t *TableRestore) populateChunks(ctx context.Context, rc *RestoreController, cp *TableCheckpoint) error { 
task := t.logger.Begin(zap.InfoLevel, "load engines and files") - chunks, err := mydump.MakeTableRegions(ctx, t.tableMeta, len(t.tableInfo.Core.Columns), rc.cfg, rc.ioWorkers, rc.store, rowIDBase) + chunks, err := mydump.MakeTableRegions(ctx, t.tableMeta, len(t.tableInfo.Core.Columns), rc.cfg, rc.ioWorkers, rc.store) if err == nil { timestamp := time.Now().Unix() failpoint.Inject("PopulateChunkTimestamp", func(v failpoint.Value) { @@ -2094,6 +2183,18 @@ func (t *TableRestore) populateChunks(ctx context.Context, rc *RestoreController return err } +func (t *TableRestore) RebaseChunkRowIDs(cp *TableCheckpoint, rowIDBase int64) { + if rowIDBase == 0 { + return + } + for _, engine := range cp.Engines { + for _, chunk := range engine.Chunks { + chunk.Chunk.PrevRowIDMax += rowIDBase + chunk.Chunk.RowIDMax += rowIDBase + } + } +} + // initializeColumns computes the "column permutation" for an INSERT INTO // statement. Suppose a table has columns (a, b, c, d) in canonical order, and // we execute `INSERT INTO (d, b, a) VALUES ...`, we will need to remap the @@ -2582,3 +2683,268 @@ func (cr *chunkRestore) restore( return ctx.Err() } } + +type tableMetaMgr struct { + session common.SQLWithRetry + taskID int64 + tr *TableRestore +} + +func (m *tableMetaMgr) InitTableMeta(ctx context.Context) error { + // avoid override existing metadata if the meta is already inserted. + stmt := `INSERT IGNORE INTO mysql.brie_sub_tasks (task_id, table_id, table_name, status) values (?, ?, ?, ?)` + task := m.tr.logger.Begin(zap.DebugLevel, "init table meta") + err := m.session.Exec(ctx, "init table meta", stmt, m.taskID, m.tr.tableInfo.ID, m.tr.tableName, metaStatusInitial.String()) + task.End(zap.ErrorLevel, err) + return errors.Trace(err) +} + +type metaStatus uint32 + +const ( + metaStatusInitial metaStatus = iota + metaStatusRowIDAllocated + metaStatusRestoreStarted + metaStatusRestoreFinished + metaStatusChecksuming + metaStatusChecksumSkipped + metaStatusFinished +) + +func (m metaStatus) String() string { + switch m { + case metaStatusInitial: + return "initialized" + case metaStatusRowIDAllocated: + return "allocated" + case metaStatusRestoreStarted: + return "restore" + case metaStatusRestoreFinished: + return "restore_finished" + case metaStatusChecksuming: + return "checksuming" + case metaStatusChecksumSkipped: + return "checksum_skipped" + case metaStatusFinished: + return "finish" + default: + panic(fmt.Sprintf("unexpected metaStatus value '%d'", m)) + } +} + +func parseMetaStatus(s string) (metaStatus, error) { + switch s { + case "", "initialized": + return metaStatusInitial, nil + case "allocated": + return metaStatusRowIDAllocated, nil + case "restore": + return metaStatusRestoreStarted, nil + case "restore_finished": + return metaStatusRestoreFinished, nil + case "checksuming": + return metaStatusChecksuming, nil + case "finish": + return metaStatusFinished, nil + default: + return metaStatusInitial, errors.Errorf("invalid meta status '%s'", s) + } +} + +func (m *tableMetaMgr) AllocTableRowIDs(ctx context.Context, tr *TableRestore, rawRowIDMax int64) (*verify.KVChecksum, int64, error) { + var newRowIDBase, newRowIDMax int64 + curStatus := metaStatusInitial + newStatus := metaStatusRowIDAllocated + var baseTotalKvs, baseTotalBytes, baseChecksum uint64 + err := m.session.Transact(ctx, "init table allocator base", func(ctx context.Context, tx *sql.Tx) error { + query := fmt.Sprintf("SELECT task_id, row_id_base, row_id_max, total_kvs_base, total_bytes_base, checksum_base, status from mysql.brie_sub_tasks 
WHERE table_id = ?") + rows, err := tx.QueryContext(ctx, query, m.tr.tableInfo.ID) + if err != nil { + return errors.Trace(err) + } + var ( + metaLightningID, rowIDBase, rowIDMax, maxRowIDMax int64 + statusValue string + ) + for rows.Next() { + if err = rows.Scan(&metaLightningID, &rowIDBase, &rowIDMax, &baseTotalKvs, &baseTotalBytes, &baseChecksum, &statusValue); err != nil { + return errors.Trace(err) + } + status, err := parseMetaStatus(statusValue) + if err != nil { + return errors.Annotatef(err, "invalid meta status '%s'", statusValue) + } + + // skip finished meta + if status >= metaStatusFinished { + continue + } + + if metaLightningID == m.taskID && status >= metaStatusRowIDAllocated { + if rowIDMax-rowIDBase != rawRowIDMax { + return errors.Errorf("verify allocator base failed. local: '%d', meta: '%d'", rawRowIDMax, rowIDMax-rowIDBase) + } + newRowIDBase = rowIDBase + newRowIDMax = rowIDMax + curStatus = status + break + } + + if rowIDMax > maxRowIDMax { + maxRowIDMax = rowIDMax + } + } + + // no enough info are available, fetch row_id max for table + if curStatus == metaStatusInitial { + if maxRowIDMax == 0 { + autoIDInfos, err := kv.FetchTableAutoIDInfos(ctx, tx, m.tr.tableName) + if err != nil { + return errors.Trace(err) + } + if len(autoIDInfos) == 1 { + maxRowIDMax = autoIDInfos[0].NextID - 1 + // if table does not contain data, we can skip do pre-checksum + if maxRowIDMax == 0 { + newStatus = metaStatusRestoreStarted + } + } else if len(autoIDInfos) == 0 { + return errors.New("can't fetch previous auto id base") + } else { + return errors.New("not supported: more than one auto id allocator found") + } + } + newRowIDBase = maxRowIDMax + newRowIDMax = newRowIDBase + rawRowIDMax + query = fmt.Sprintf("update mysql.brie_sub_tasks set row_id_base = %d, row_id_max = %d, status = '%s', where table_id = %d and task_id = %d", + newRowIDBase, newRowIDMax, newStatus.String(), m.tr.tableInfo.ID, metaLightningID) + _, err = tx.ExecContext(ctx, query) + return errors.Trace(err) + } + return nil + }) + + var checksum *verify.KVChecksum + // need to do checksum and update checksum meta + if newStatus < metaStatusRestoreStarted { + // table contains data but haven't do checksum yet + if newRowIDBase > 0 && baseTotalKvs == 0 { + baseChecksum, err := DoChecksum(ctx, tr.tableInfo) + if err != nil { + return nil, 0, errors.Trace(err) + } + ck := verify.MakeKVChecksum(baseChecksum.TotalBytes, baseChecksum.TotalKVs, baseChecksum.Checksum) + checksum = &ck + } + + if checksum != nil { + if err = m.UpdateTableBaseChecksum(ctx, checksum); err != nil { + return nil, 0, errors.Trace(err) + } + + tr.logger.Info("checksum before restore table", zap.Object("checksum", checksum)) + } + } + return checksum, newRowIDBase, nil +} + +func (m *tableMetaMgr) UpdateTableBaseChecksum(ctx context.Context, checksum *verify.KVChecksum) error { + query := fmt.Sprintf("update mysql.brie_sub_tasks set kv_kvs_base = %d, kv_bytes_base = %d, checksum_base = %d, status = '%s' where table_id = %d and task_id = %d", + checksum.SumKVS(), checksum.SumSize(), checksum.Sum(), metaStatusRestoreStarted.String(), m.tr.tableInfo.ID, m.taskID) + + return m.session.Exec(ctx, "update base checksum", query) +} + +func (m *tableMetaMgr) updateTableStatus(ctx context.Context, status metaStatus) error { + query := fmt.Sprintf("update mysql.brie_sub_tasks set status = '%s' where table_id = %d and task_id = %d", + status.String(), m.tr.tableInfo.ID, m.taskID) + + return m.session.Exec(ctx, "update meta status", query) +} + +func (m 
*tableMetaMgr) checkAndUpdateLocalChecksum(ctx context.Context, checksum *verify.KVChecksum) (bool, *verify.KVChecksum, error) { + + var ( + baseTotalKvs, baseTotalBytes, baseChecksum uint64 + taskKvs, taskBytes, taskChecksum uint64 + totalKvs, totalBytes, totalChecksum uint64 + ) + newStatus := metaStatusChecksuming + needChecksum := true + err := m.session.Transact(ctx, "checksum pre-check", func(ctx context.Context, tx *sql.Tx) error { + query := fmt.Sprintf("SELECT task_id, total_kvs_base, total_bytes_base, checksum_base, total_kvs, total_bytes, checksum, status from mysql.brie_sub_tasks WHERE table_id = ?") + rows, err := tx.QueryContext(ctx, query, m.tr.tableInfo.ID) + if err != nil { + return errors.Trace(err) + } + var ( + taskID int64 + statusValue string + ) + for rows.Next() { + if err = rows.Scan(&taskID, &baseTotalKvs, &baseTotalBytes, &baseChecksum, &taskKvs, &taskBytes, &taskChecksum, &statusValue); err != nil { + return errors.Trace(err) + } + status, err := parseMetaStatus(statusValue) + if err != nil { + return errors.Annotatef(err, "invalid meta status '%s'", statusValue) + } + + // skip finished meta + if status >= metaStatusFinished { + continue + } + + if taskID == m.taskID { + if status > metaStatusChecksuming { + newStatus = status + needChecksum = status == metaStatusChecksuming + return nil + } + + continue + } + + if status < metaStatusChecksuming { + newStatus = metaStatusChecksumSkipped + needChecksum = false + break + } else if status == metaStatusChecksuming { + return errors.New("another task is checksuming, there must be something wrong!") + } + + totalBytes += baseTotalBytes + totalKvs += baseTotalKvs + totalChecksum ^= baseChecksum + + totalBytes += taskBytes + totalKvs += taskKvs + totalChecksum ^= taskChecksum + } + + if rows.Err() != nil { + return rows.Err() + } + + query = fmt.Sprintf("update mysql.brie_sub_tasks set total_kvs = %d, total_bytes = %d, checksum = %d where table_id = %d and id = %d", + checksum.SumKVS(), checksum.SumSize(), checksum.Sum(), m.tr.tableInfo.ID, m.taskID) + + _, err = tx.ExecContext(ctx, query) + return errors.Trace(err) + }) + if err != nil { + return false, nil, err + } + + var remoteChecksum *verify.KVChecksum + if needChecksum { + ck := verify.MakeKVChecksum(totalBytes, totalKvs, totalChecksum) + remoteChecksum = &ck + } + return needChecksum, remoteChecksum, nil +} + +func (m *tableMetaMgr) FinishTable(ctx context.Context) error { + return m.session.Exec(ctx, "clean up metas", "DELETE FROM mysql.brie_sub_tasks where table_id = ? and id = ? 
and (status = 'checksuming' or status = 'checksum_skipped')", + m.tr.tableInfo.ID, m.taskID) +} diff --git a/pkg/lightning/restore/restore_test.go b/pkg/lightning/restore/restore_test.go index 613dbb96c..c32760827 100644 --- a/pkg/lightning/restore/restore_test.go +++ b/pkg/lightning/restore/restore_test.go @@ -298,7 +298,7 @@ func (s *tableRestoreSuite) TestPopulateChunks(c *C) { } rc := &RestoreController{cfg: s.cfg, ioWorkers: worker.NewPool(context.Background(), 1, "io"), store: s.store} - err := s.tr.populateChunks(context.Background(), rc, cp, 0) + err := s.tr.populateChunks(context.Background(), rc, cp) c.Assert(err, IsNil) c.Assert(cp.Engines, DeepEquals, map[int32]*EngineCheckpoint{ -1: { @@ -403,7 +403,7 @@ func (s *tableRestoreSuite) TestPopulateChunks(c *C) { s.cfg.Mydumper.StrictFormat = true regionSize := s.cfg.Mydumper.MaxRegionSize s.cfg.Mydumper.MaxRegionSize = 5 - err = s.tr.populateChunks(context.Background(), rc, cp, 0) + err = s.tr.populateChunks(context.Background(), rc, cp) c.Assert(err, NotNil) c.Assert(err, ErrorMatches, `.*unknown columns in header \[1 2 3\]`) s.cfg.Mydumper.MaxRegionSize = regionSize @@ -465,7 +465,7 @@ func (s *tableRestoreSuite) TestPopulateChunksCSVHeader(c *C) { tr, err := NewTableRestore("`db`.`table`", tableMeta, s.dbInfo, s.tableInfo, &TableCheckpoint{}) c.Assert(err, IsNil) - c.Assert(tr.populateChunks(context.Background(), rc, cp, 0), IsNil) + c.Assert(tr.populateChunks(context.Background(), rc, cp), IsNil) c.Assert(cp.Engines, DeepEquals, map[int32]*EngineCheckpoint{ -1: { From 224906c728b3d558b41d651127dfb3ef9fe165d6 Mon Sep 17 00:00:00 2001 From: glorv Date: Tue, 9 Mar 2021 11:07:03 +0800 Subject: [PATCH 04/32] reuse task id when recover from checkpoint --- pkg/lightning/restore/restore.go | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/pkg/lightning/restore/restore.go b/pkg/lightning/restore/restore.go index 0b495dd80..0812e11fb 100644 --- a/pkg/lightning/restore/restore.go +++ b/pkg/lightning/restore/restore.go @@ -220,6 +220,10 @@ func NewRestoreControllerWithPauser( if err := verifyCheckpoint(cfg, taskCp); err != nil { return nil, errors.Trace(err) } + // reuse task id to reuse task meta correctly. 
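
Keeping the checkpoint's task id matters because the meta rows are keyed by (table_id, task_id): a resumed run must update the rows it wrote earlier rather than register as a brand-new task and be handed a second row-id window. A trivial sketch of the decision the lines below make, where TaskCheckpoint is only a stand-in for the real checkpoint struct:

// TaskCheckpoint stands in for the persisted task checkpoint.
type TaskCheckpoint struct{ TaskID int64 }

// pickTaskID returns the id a (possibly resumed) import should run under.
func pickTaskID(freshID int64, taskCp *TaskCheckpoint) int64 {
	if taskCp != nil {
		return taskCp.TaskID // resumed run: keep the id the meta rows were written with
	}
	return freshID // first run: keep the newly generated id
}
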
+ if taskCp != nil { + cfg.TaskID = taskCp.TaskId + } var backend kv.Backend switch cfg.TikvImporter.Backend { From bcba06671733a41fd68a07dcf649ebe0d0ba6bac Mon Sep 17 00:00:00 2001 From: glorv Date: Wed, 10 Mar 2021 10:17:09 +0800 Subject: [PATCH 05/32] fix lint --- pkg/lightning/restore/restore.go | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/pkg/lightning/restore/restore.go b/pkg/lightning/restore/restore.go index 0812e11fb..0dbcb05d6 100644 --- a/pkg/lightning/restore/restore.go +++ b/pkg/lightning/restore/restore.go @@ -2867,7 +2867,6 @@ func (m *tableMetaMgr) updateTableStatus(ctx context.Context, status metaStatus) } func (m *tableMetaMgr) checkAndUpdateLocalChecksum(ctx context.Context, checksum *verify.KVChecksum) (bool, *verify.KVChecksum, error) { - var ( baseTotalKvs, baseTotalBytes, baseChecksum uint64 taskKvs, taskBytes, taskChecksum uint64 @@ -2930,8 +2929,8 @@ func (m *tableMetaMgr) checkAndUpdateLocalChecksum(ctx context.Context, checksum return rows.Err() } - query = fmt.Sprintf("update mysql.brie_sub_tasks set total_kvs = %d, total_bytes = %d, checksum = %d where table_id = %d and id = %d", - checksum.SumKVS(), checksum.SumSize(), checksum.Sum(), m.tr.tableInfo.ID, m.taskID) + query = fmt.Sprintf("update mysql.brie_sub_tasks set total_kvs = %d, total_bytes = %d, checksum = %d, status = '%s' where table_id = %d and id = %d", + checksum.SumKVS(), checksum.SumSize(), checksum.Sum(), newStatus, m.tr.tableInfo.ID, m.taskID) _, err = tx.ExecContext(ctx, query) return errors.Trace(err) From 03399d769ca11e6ad389a361b388916a1e6bceaf Mon Sep 17 00:00:00 2001 From: glorv Date: Wed, 10 Mar 2021 10:39:15 +0800 Subject: [PATCH 06/32] fix meta table field name --- pkg/lightning/restore/restore.go | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/pkg/lightning/restore/restore.go b/pkg/lightning/restore/restore.go index 0dbcb05d6..112ae6b97 100644 --- a/pkg/lightning/restore/restore.go +++ b/pkg/lightning/restore/restore.go @@ -74,19 +74,19 @@ const ( const ( // CreateBRIESubJobTable stores the per-table sub jobs information used by TiDB Lightning CreateBRIESubJobTable = `CREATE TABLE IF NOT EXISTS mysql.brie_sub_tasks ( - id BIGINT(20) UNSIGNED, + task_id BIGINT(20) UNSIGNED, table_id BIGINT(64) NOT NULL, table_name VARCHAR(64) NOT NULL, row_id_base BIGINT(20) NOT NULL DEFAULT 0, row_id_max BIGINT(20) NOT NULL DEFAULT 0, - base_total_kvs BIGINT(20) UNSIGNED NOT NULL DEFAULT 0, - base_total_bytes BIGINT(20) UNSIGNED NOT NULL DEFAULT 0, - base_checksum BIGINT(20) UNSIGNED NOT NULL DEFAULT 0, + total_kvs_base BIGINT(20) UNSIGNED NOT NULL DEFAULT 0, + total_bytes_base BIGINT(20) UNSIGNED NOT NULL DEFAULT 0, + checksum_base BIGINT(20) UNSIGNED NOT NULL DEFAULT 0, total_kvs BIGINT(20) UNSIGNED NOT NULL DEFAULT 0, total_bytes BIGINT(20) UNSIGNED NOT NULL DEFAULT 0, checksum BIGINT(20) UNSIGNED NOT NULL DEFAULT 0, status VARCHAR(32) NOT NULL, - PRIMARY KEY (table_id, id) + PRIMARY KEY (table_id, task_id) );` ) @@ -2853,7 +2853,7 @@ func (m *tableMetaMgr) AllocTableRowIDs(ctx context.Context, tr *TableRestore, r } func (m *tableMetaMgr) UpdateTableBaseChecksum(ctx context.Context, checksum *verify.KVChecksum) error { - query := fmt.Sprintf("update mysql.brie_sub_tasks set kv_kvs_base = %d, kv_bytes_base = %d, checksum_base = %d, status = '%s' where table_id = %d and task_id = %d", + query := fmt.Sprintf("update mysql.brie_sub_tasks set total_kvs_base = %d, total_bytes_base = %d, checksum_base = %d, status = '%s' where table_id = %d and 
task_id = %d", checksum.SumKVS(), checksum.SumSize(), checksum.Sum(), metaStatusRestoreStarted.String(), m.tr.tableInfo.ID, m.taskID) return m.session.Exec(ctx, "update base checksum", query) @@ -2929,7 +2929,7 @@ func (m *tableMetaMgr) checkAndUpdateLocalChecksum(ctx context.Context, checksum return rows.Err() } - query = fmt.Sprintf("update mysql.brie_sub_tasks set total_kvs = %d, total_bytes = %d, checksum = %d, status = '%s' where table_id = %d and id = %d", + query = fmt.Sprintf("update mysql.brie_sub_tasks set total_kvs = %d, total_bytes = %d, checksum = %d, status = '%s' where table_id = %d and task_id = %d", checksum.SumKVS(), checksum.SumSize(), checksum.Sum(), newStatus, m.tr.tableInfo.ID, m.taskID) _, err = tx.ExecContext(ctx, query) @@ -2948,6 +2948,6 @@ func (m *tableMetaMgr) checkAndUpdateLocalChecksum(ctx context.Context, checksum } func (m *tableMetaMgr) FinishTable(ctx context.Context) error { - return m.session.Exec(ctx, "clean up metas", "DELETE FROM mysql.brie_sub_tasks where table_id = ? and id = ? and (status = 'checksuming' or status = 'checksum_skipped')", + return m.session.Exec(ctx, "clean up metas", "DELETE FROM mysql.brie_sub_tasks where table_id = ? and task_id = ? and (status = 'checksuming' or status = 'checksum_skipped')", m.tr.tableInfo.ID, m.taskID) } From 6d89f9dc0d7fae30790e0e600bd816dca64383a7 Mon Sep 17 00:00:00 2001 From: glorv Date: Wed, 10 Mar 2021 15:39:42 +0800 Subject: [PATCH 07/32] fix integration tests --- .../data/incr.rowid_uk_inc-schema.sql | 2 +- tests/lightning_incremental/data/incr.rowid_uk_inc.sql | 2 +- .../data/incr.uk_auto_inc-schema.sql | 4 ++-- tests/lightning_incremental/data/incr.uk_auto_inc.sql | 2 +- .../data1/incr.rowid_uk_inc-schema.sql | 2 +- tests/lightning_incremental/data1/incr.rowid_uk_inc.sql | 2 +- .../data1/incr.uk_auto_inc-schema.sql | 4 ++-- tests/lightning_incremental/data1/incr.uk_auto_inc.sql | 2 +- tests/lightning_incremental/run.sh | 8 ++++---- tests/lightning_tidb_rowid/run.sh | 9 ++------- 10 files changed, 16 insertions(+), 21 deletions(-) diff --git a/tests/lightning_incremental/data/incr.rowid_uk_inc-schema.sql b/tests/lightning_incremental/data/incr.rowid_uk_inc-schema.sql index 5beb69550..c1ace8ba9 100644 --- a/tests/lightning_incremental/data/incr.rowid_uk_inc-schema.sql +++ b/tests/lightning_incremental/data/incr.rowid_uk_inc-schema.sql @@ -1,4 +1,4 @@ CREATE TABLE `rowid_uk_inc` ( `id` bigint UNIQUE KEY AUTO_INCREMENT, - s varchar(16), + v varchar(16) ) ENGINE=InnoDB DEFAULT CHARSET=utf8 COLLATE=utf8_bin; diff --git a/tests/lightning_incremental/data/incr.rowid_uk_inc.sql b/tests/lightning_incremental/data/incr.rowid_uk_inc.sql index 91831a9ff..b90acb9b1 100644 --- a/tests/lightning_incremental/data/incr.rowid_uk_inc.sql +++ b/tests/lightning_incremental/data/incr.rowid_uk_inc.sql @@ -1,5 +1,5 @@ /*!40103 SET TIME_ZONE='+00:00' */; -INSERT INTO `rowid_uk_inc` (`id`) VALUES +INSERT INTO `rowid_uk_inc` (`v`) VALUES ('a'), ('b'), ('c'); diff --git a/tests/lightning_incremental/data/incr.uk_auto_inc-schema.sql b/tests/lightning_incremental/data/incr.uk_auto_inc-schema.sql index d38886b60..3901d7ed3 100644 --- a/tests/lightning_incremental/data/incr.uk_auto_inc-schema.sql +++ b/tests/lightning_incremental/data/incr.uk_auto_inc-schema.sql @@ -1,4 +1,4 @@ CREATE TABLE `uk_auto_inc` ( - `u` int PRIMARY KEY, - `id` bigint UNIQUE KEY AUTO_INCREMENT, + `pk` int PRIMARY KEY, + `id` bigint UNIQUE KEY AUTO_INCREMENT ) ENGINE=InnoDB DEFAULT CHARSET=utf8 COLLATE=utf8_bin; diff --git 
a/tests/lightning_incremental/data/incr.uk_auto_inc.sql b/tests/lightning_incremental/data/incr.uk_auto_inc.sql index 3384ed652..4b1e7b134 100644 --- a/tests/lightning_incremental/data/incr.uk_auto_inc.sql +++ b/tests/lightning_incremental/data/incr.uk_auto_inc.sql @@ -1,5 +1,5 @@ /*!40103 SET TIME_ZONE='+00:00' */; -INSERT INTO `uk_auto_inc` (`u`) VALUES +INSERT INTO `uk_auto_inc` (`pk`) VALUES (1), (2), (3); diff --git a/tests/lightning_incremental/data1/incr.rowid_uk_inc-schema.sql b/tests/lightning_incremental/data1/incr.rowid_uk_inc-schema.sql index cf5bb71fc..c9bc49801 100644 --- a/tests/lightning_incremental/data1/incr.rowid_uk_inc-schema.sql +++ b/tests/lightning_incremental/data1/incr.rowid_uk_inc-schema.sql @@ -1,4 +1,4 @@ CREATE TABLE `uk_auto_inc` ( `id` bigint UNIQUE KEY AUTO_INCREMENT, - s varchar(16), + v varchar(16) ) ENGINE=InnoDB DEFAULT CHARSET=utf8 COLLATE=utf8_bin; diff --git a/tests/lightning_incremental/data1/incr.rowid_uk_inc.sql b/tests/lightning_incremental/data1/incr.rowid_uk_inc.sql index 5938fc484..f4ab9a5a7 100644 --- a/tests/lightning_incremental/data1/incr.rowid_uk_inc.sql +++ b/tests/lightning_incremental/data1/incr.rowid_uk_inc.sql @@ -1,5 +1,5 @@ /*!40103 SET TIME_ZONE='+00:00' */; -INSERT INTO `rowid_uk_inc` (`s`) VALUES +INSERT INTO `rowid_uk_inc` (`v`) VALUES ("d"), ("e"), ("f"); diff --git a/tests/lightning_incremental/data1/incr.uk_auto_inc-schema.sql b/tests/lightning_incremental/data1/incr.uk_auto_inc-schema.sql index d38886b60..3901d7ed3 100644 --- a/tests/lightning_incremental/data1/incr.uk_auto_inc-schema.sql +++ b/tests/lightning_incremental/data1/incr.uk_auto_inc-schema.sql @@ -1,4 +1,4 @@ CREATE TABLE `uk_auto_inc` ( - `u` int PRIMARY KEY, - `id` bigint UNIQUE KEY AUTO_INCREMENT, + `pk` int PRIMARY KEY, + `id` bigint UNIQUE KEY AUTO_INCREMENT ) ENGINE=InnoDB DEFAULT CHARSET=utf8 COLLATE=utf8_bin; diff --git a/tests/lightning_incremental/data1/incr.uk_auto_inc.sql b/tests/lightning_incremental/data1/incr.uk_auto_inc.sql index ece660062..31d87c135 100644 --- a/tests/lightning_incremental/data1/incr.uk_auto_inc.sql +++ b/tests/lightning_incremental/data1/incr.uk_auto_inc.sql @@ -1,5 +1,5 @@ /*!40103 SET TIME_ZONE='+00:00' */; -INSERT INTO `pk_auto_inc` (`id`) VALUES +INSERT INTO `uk_auto_inc` (`pk`) VALUES (4), (5), (6); diff --git a/tests/lightning_incremental/run.sh b/tests/lightning_incremental/run.sh index 30c1ab58a..ed36c76d8 100644 --- a/tests/lightning_incremental/run.sh +++ b/tests/lightning_incremental/run.sh @@ -49,8 +49,8 @@ for backend in importer local; do check_contains "group_concat(v): a,b,c" done - run_sql "SELECT sum(u) from incr.uk_auto_inc;" - check_contains "sum(u): 6" + run_sql "SELECT sum(pk) from incr.uk_auto_inc;" + check_contains "sum(pk): 6" # incrementally import all data in data1 run_lightning --backend $backend -d "tests/$TEST_NAME/data1" @@ -76,6 +76,6 @@ for backend in importer local; do check_contains "group_concat(v): a,b,c,d,e,f" done - run_sql "SELECT sum(u) from incr.uk_auto_inc;" - check_contains "sum(u): 21" + run_sql "SELECT sum(pk) from incr.uk_auto_inc;" + check_contains "sum(pk): 21" done diff --git a/tests/lightning_tidb_rowid/run.sh b/tests/lightning_tidb_rowid/run.sh index 4397c2679..395c21978 100755 --- a/tests/lightning_tidb_rowid/run.sh +++ b/tests/lightning_tidb_rowid/run.sh @@ -57,13 +57,8 @@ for BACKEND in local importer tidb; do run_sql 'SELECT count(*), min(_tidb_rowid), max(_tidb_rowid) FROM rowid.pre_rebase' check_contains 'count(*): 1' - if [ "$BACKEND" == 'tidb' ]; then - check_contains 
'min(_tidb_rowid): 70000' - check_contains 'max(_tidb_rowid): 70000' - else - check_contains 'min(_tidb_rowid): 1' - check_contains 'max(_tidb_rowid): 1' - fi + check_contains 'min(_tidb_rowid): 70000' + check_contains 'max(_tidb_rowid): 70000' run_sql 'INSERT INTO rowid.pre_rebase VALUES ("?")' run_sql 'SELECT _tidb_rowid > 70000 FROM rowid.pre_rebase WHERE pk = "?"' check_contains '_tidb_rowid > 70000: 1' From bdcbf3bb89fad75c0aeeabcefd7ed66640d70978 Mon Sep 17 00:00:00 2001 From: glorv Date: Wed, 10 Mar 2021 20:23:17 +0800 Subject: [PATCH 08/32] fix rowid check for auto-random --- pkg/lightning/restore/restore.go | 63 ++++++++++++++++++++++---------- 1 file changed, 44 insertions(+), 19 deletions(-) diff --git a/pkg/lightning/restore/restore.go b/pkg/lightning/restore/restore.go index 112ae6b97..26fc77fa9 100644 --- a/pkg/lightning/restore/restore.go +++ b/pkg/lightning/restore/restore.go @@ -29,6 +29,7 @@ import ( "github.com/pingcap/failpoint" sstpb "github.com/pingcap/kvproto/pkg/import_sstpb" "github.com/pingcap/parser/model" + "github.com/pingcap/parser/mysql" "github.com/pingcap/tidb/meta/autoid" "github.com/pingcap/tidb/table" "github.com/pingcap/tidb/table/tables" @@ -1301,7 +1302,7 @@ func (t *TableRestore) restoreTable( // "show table next_row_id" is only available after v4.0.0 if version.Major >= 4 && rc.cfg.TikvImporter.Backend != config.BackendTiDB && - (common.TableHasAutoRowID(t.tableInfo.Core) || t.tableInfo.Core.GetAutoIncrementColInfo() != nil) { + (common.TableHasAutoRowID(t.tableInfo.Core) || t.tableInfo.Core.GetAutoIncrementColInfo() != nil || t.tableInfo.Core.ContainsAutoRandomBits()) { // first, insert a new-line into meta table if err = metaMgr.InitTableMeta(ctx); err != nil { return false, err @@ -2784,14 +2785,22 @@ func (m *tableMetaMgr) AllocTableRowIDs(ctx context.Context, tr *TableRestore, r continue } - if metaLightningID == m.taskID && status >= metaStatusRowIDAllocated { - if rowIDMax-rowIDBase != rawRowIDMax { - return errors.Errorf("verify allocator base failed. local: '%d', meta: '%d'", rawRowIDMax, rowIDMax-rowIDBase) + if metaLightningID == m.taskID { + if status >= metaStatusRowIDAllocated { + if rowIDMax-rowIDBase != rawRowIDMax { + return errors.Errorf("verify allocator base failed. local: '%d', meta: '%d'", rawRowIDMax, rowIDMax-rowIDBase) + } + newRowIDBase = rowIDBase + newRowIDMax = rowIDMax + curStatus = status + break } - newRowIDBase = rowIDBase - newRowIDMax = rowIDMax - curStatus = status - break + continue + } + + // other tasks has finished this logic, we needn't do again. + if status >= metaStatusRowIDAllocated { + newStatus = metaStatusRestoreStarted } if rowIDMax > maxRowIDMax { @@ -2802,20 +2811,36 @@ func (m *tableMetaMgr) AllocTableRowIDs(ctx context.Context, tr *TableRestore, r // no enough info are available, fetch row_id max for table if curStatus == metaStatusInitial { if maxRowIDMax == 0 { + // NOTE: currently, if a table contains auto_incremental unique key and _tidb_rowid, + // the `show table next_row_id` will returns the unique key field only. 
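
In other words, the code below must ask SHOW TABLE ... NEXT_ROW_ID about the right column: an explicit AUTO_INCREMENT column first, an AUTO_RANDOM primary key next (that branch is added by patch 10 later in this series), and the hidden _tidb_rowid handle as the fallback. A standalone restatement of that rule, using plain strings instead of the real *model.TableInfo:

// pickAutoIDColumn returns the column whose next-id entry should be read from
// SHOW TABLE ... NEXT_ROW_ID, or false when the table has no such column.
func pickAutoIDColumn(autoIncCol, autoRandomPK string, hasAutoRowID bool) (string, bool) {
	switch {
	case autoIncCol != "":
		return autoIncCol, true // explicit AUTO_INCREMENT column
	case autoRandomPK != "":
		return autoRandomPK, true // AUTO_RANDOM clustered primary key
	case hasAutoRowID:
		return "_tidb_rowid", true // hidden handle column
	}
	return "", false
}
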
+ var autoIDField string + for _, col := range tr.tableInfo.Core.Columns { + if mysql.HasAutoIncrementFlag(col.Flag) { + autoIDField = col.Name.L + break + } + } + if len(autoIDField) == 0 && common.TableHasAutoRowID(tr.tableInfo.Core) { + autoIDField = model.ExtraHandleName.L + } + if len(autoIDField) == 0 { + return errors.Errorf("table %s contains auto increment id or _tidb_rowid, but target field not found", tr.tableName) + } + autoIDInfos, err := kv.FetchTableAutoIDInfos(ctx, tx, m.tr.tableName) if err != nil { return errors.Trace(err) } - if len(autoIDInfos) == 1 { - maxRowIDMax = autoIDInfos[0].NextID - 1 - // if table does not contain data, we can skip do pre-checksum - if maxRowIDMax == 0 { - newStatus = metaStatusRestoreStarted + found := false + for _, info := range autoIDInfos { + if strings.ToLower(info.Column) == autoIDField { + maxRowIDMax = info.NextID - 1 + found = true + break } - } else if len(autoIDInfos) == 0 { - return errors.New("can't fetch previous auto id base") - } else { - return errors.New("not supported: more than one auto id allocator found") + } + if !found { + return errors.Errorf("can't fetch previous auto id base for table %s field '%s'", tr.tableName, autoIDField) } } newRowIDBase = maxRowIDMax @@ -2829,8 +2854,8 @@ func (m *tableMetaMgr) AllocTableRowIDs(ctx context.Context, tr *TableRestore, r }) var checksum *verify.KVChecksum - // need to do checksum and update checksum meta - if newStatus < metaStatusRestoreStarted { + // need to do checksum and update checksum meta since we are the first one. + if curStatus < metaStatusRestoreStarted && newStatus < metaStatusRestoreStarted { // table contains data but haven't do checksum yet if newRowIDBase > 0 && baseTotalKvs == 0 { baseChecksum, err := DoChecksum(ctx, tr.tableInfo) From d7d0db1a62f18f1dfee9426db4f38a07106b85e6 Mon Sep 17 00:00:00 2001 From: glorv Date: Fri, 12 Mar 2021 13:16:46 +0800 Subject: [PATCH 09/32] fix version --- pkg/lightning/restore/restore.go | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/pkg/lightning/restore/restore.go b/pkg/lightning/restore/restore.go index 010d8dc2a..76c90fcef 100644 --- a/pkg/lightning/restore/restore.go +++ b/pkg/lightning/restore/restore.go @@ -53,6 +53,7 @@ import ( "github.com/pingcap/br/pkg/pdutil" "github.com/pingcap/br/pkg/storage" "github.com/pingcap/br/pkg/utils" + "github.com/pingcap/br/pkg/version" "github.com/pingcap/br/pkg/version/build" ) @@ -1304,7 +1305,7 @@ func (t *TableRestore) restoreTable( return false, errors.Trace(err) } - version, err := common.ExtractTiDBVersion(versionStr) + tidbVersion, err := version.ExtractTiDBVersion(versionStr) if err != nil { return false, errors.Trace(err) } @@ -1322,7 +1323,7 @@ func (t *TableRestore) restoreTable( } // "show table next_row_id" is only available after v4.0.0 - if version.Major >= 4 && rc.cfg.TikvImporter.Backend != config.BackendTiDB && + if tidbVersion.Major >= 4 && rc.cfg.TikvImporter.Backend != config.BackendTiDB && (common.TableHasAutoRowID(t.tableInfo.Core) || t.tableInfo.Core.GetAutoIncrementColInfo() != nil || t.tableInfo.Core.ContainsAutoRandomBits()) { // first, insert a new-line into meta table if err = metaMgr.InitTableMeta(ctx); err != nil { From a55d286fd971bed3ccfcfa9970ed733ae609b4b6 Mon Sep 17 00:00:00 2001 From: glorv Date: Mon, 15 Mar 2021 12:16:31 +0800 Subject: [PATCH 10/32] fix auto random --- pkg/lightning/restore/restore.go | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/pkg/lightning/restore/restore.go 
b/pkg/lightning/restore/restore.go index 76c90fcef..9794c6e9c 100644 --- a/pkg/lightning/restore/restore.go +++ b/pkg/lightning/restore/restore.go @@ -2843,6 +2843,9 @@ func (m *tableMetaMgr) AllocTableRowIDs(ctx context.Context, tr *TableRestore, r if mysql.HasAutoIncrementFlag(col.Flag) { autoIDField = col.Name.L break + } else if mysql.HasPriKeyFlag(col.Flag) && tr.tableInfo.Core.AutoRandomBits > 0 { + autoIDField = col.Name.L + break } } if len(autoIDField) == 0 && common.TableHasAutoRowID(tr.tableInfo.Core) { @@ -2998,6 +3001,6 @@ func (m *tableMetaMgr) checkAndUpdateLocalChecksum(ctx context.Context, checksum } func (m *tableMetaMgr) FinishTable(ctx context.Context) error { - return m.session.Exec(ctx, "clean up metas", "DELETE FROM mysql.brie_sub_tasks where table_id = ? and task_id = ? and (status = 'checksuming' or status = 'checksum_skipped')", - m.tr.tableInfo.ID, m.taskID) + return m.session.Exec(ctx, "clean up metas", "DELETE FROM mysql.brie_sub_tasks where table_id = ? and (status = 'checksuming' or status = 'checksum_skipped')", + m.tr.tableInfo.ID) } From 0c4d45f6992f81ee4512cf6aacff3e97f891f591 Mon Sep 17 00:00:00 2001 From: glorv Date: Tue, 16 Mar 2021 13:23:02 +0800 Subject: [PATCH 11/32] fix allocate row_id --- pkg/lightning/backend/tidb.go | 3 +- pkg/lightning/common/util.go | 14 +++++- pkg/lightning/glue/glue.go | 9 ---- pkg/lightning/restore/restore.go | 57 +++++++++++++++++------ pkg/lightning/restore/restore_test.go | 66 +++++++++++++++++++++++++++ 5 files changed, 124 insertions(+), 25 deletions(-) diff --git a/pkg/lightning/backend/tidb.go b/pkg/lightning/backend/tidb.go index 204b50a16..517494a76 100644 --- a/pkg/lightning/backend/tidb.go +++ b/pkg/lightning/backend/tidb.go @@ -34,7 +34,6 @@ import ( "github.com/pingcap/br/pkg/lightning/common" "github.com/pingcap/br/pkg/lightning/config" - "github.com/pingcap/br/pkg/lightning/glue" "github.com/pingcap/br/pkg/lightning/log" "github.com/pingcap/br/pkg/lightning/verification" "github.com/pingcap/br/pkg/version" @@ -560,7 +559,7 @@ type TableAutoIDInfo struct { Type string } -func FetchTableAutoIDInfos(ctx context.Context, exec glue.QueryExecutor, tableName string) ([]*TableAutoIDInfo, error) { +func FetchTableAutoIDInfos(ctx context.Context, exec common.QueryExecutor, tableName string) ([]*TableAutoIDInfo, error) { rows, e := exec.QueryContext(ctx, fmt.Sprintf("SHOW TABLE %s NEXT_ROW_ID", tableName)) if e != nil { return nil, errors.Trace(e) diff --git a/pkg/lightning/common/util.go b/pkg/lightning/common/util.go index 562335e29..d10e22fd4 100644 --- a/pkg/lightning/common/util.go +++ b/pkg/lightning/common/util.go @@ -99,9 +99,21 @@ func IsEmptyDir(name string) bool { return len(entries) == 0 } +type QueryExecutor interface { + QueryContext(ctx context.Context, query string, args ...interface{}) (*sql.Rows, error) + QueryRowContext(ctx context.Context, query string, args ...interface{}) *sql.Row +} + +type DBExecutor interface { + QueryExecutor + BeginTx(ctx context.Context, opts *sql.TxOptions) (*sql.Tx, error) + ExecContext(ctx context.Context, query string, args ...interface{}) (sql.Result, error) +} + // SQLWithRetry constructs a retryable transaction. 
type SQLWithRetry struct { - DB *sql.DB + // either *sql.DB or *sql.Conn + DB DBExecutor Logger log.Logger HideQueryLog bool } diff --git a/pkg/lightning/glue/glue.go b/pkg/lightning/glue/glue.go index 592d0c8a7..cf3a17f5d 100644 --- a/pkg/lightning/glue/glue.go +++ b/pkg/lightning/glue/glue.go @@ -189,12 +189,3 @@ const ( RecordEstimatedChunk = "EstimatedChunk" RecordFinishedChunk = "FinishedChunk" ) - -type QueryExecutor interface { - QueryContext(ctx context.Context, query string, args ...interface{}) (*sql.Rows, error) -} - -type DBExecutor interface { - QueryExecutor - BeginTx(ctx context.Context, opts *sql.TxOptions) (*sql.Tx, error) -} diff --git a/pkg/lightning/restore/restore.go b/pkg/lightning/restore/restore.go index c23837904..55d68edfa 100644 --- a/pkg/lightning/restore/restore.go +++ b/pkg/lightning/restore/restore.go @@ -1241,10 +1241,15 @@ func (rc *RestoreController) restoreTables(ctx context.Context) error { restoreErr.Set(err) continue } + conn, err := db.Conn(ctx) + if err != nil { + restoreErr.Set(err) + continue + } metaMgr := &tableMetaMgr{ session: common.SQLWithRetry{ - DB: db, + DB: conn, Logger: task.tr.logger, }, taskID: rc.cfg.TaskID, @@ -1282,10 +1287,14 @@ func (t *TableRestore) restoreTable( if err != nil { return false, errors.Trace(err) } + conn, err := db.Conn(ctx) + if err != nil { + return false, errors.Trace(err) + } metaMgr := &tableMetaMgr{ session: common.SQLWithRetry{ - DB: db, + DB: conn, Logger: t.logger, }, taskID: rc.cfg.TaskID, @@ -2788,8 +2797,12 @@ func (m *tableMetaMgr) AllocTableRowIDs(ctx context.Context, tr *TableRestore, r curStatus := metaStatusInitial newStatus := metaStatusRowIDAllocated var baseTotalKvs, baseTotalBytes, baseChecksum uint64 - err := m.session.Transact(ctx, "init table allocator base", func(ctx context.Context, tx *sql.Tx) error { - query := fmt.Sprintf("SELECT task_id, row_id_base, row_id_max, total_kvs_base, total_bytes_base, checksum_base, status from mysql.brie_sub_tasks WHERE table_id = ?") + err := m.session.Exec(ctx, "enable pessimistic transaction", "SET SESSION tidb_txn_mode = 'pessimistic';") + if err != nil { + return nil, 0, errors.Annotate(err, "enable pessimistic transaction failed") + } + err = m.session.Transact(ctx, "init table allocator base", func(ctx context.Context, tx *sql.Tx) error { + query := fmt.Sprintf("SELECT task_id, row_id_base, row_id_max, total_kvs_base, total_bytes_base, checksum_base, status from mysql.brie_sub_tasks WHERE table_id = ? FOR UPDATE") rows, err := tx.QueryContext(ctx, query, m.tr.tableInfo.ID) if err != nil { return errors.Trace(err) @@ -2813,13 +2826,13 @@ func (m *tableMetaMgr) AllocTableRowIDs(ctx context.Context, tr *TableRestore, r } if metaLightningID == m.taskID { + curStatus = status if status >= metaStatusRowIDAllocated { if rowIDMax-rowIDBase != rawRowIDMax { return errors.Errorf("verify allocator base failed. 
local: '%d', meta: '%d'", rawRowIDMax, rowIDMax-rowIDBase) } newRowIDBase = rowIDBase newRowIDMax = rowIDMax - curStatus = status break } continue @@ -2875,17 +2888,33 @@ func (m *tableMetaMgr) AllocTableRowIDs(ctx context.Context, tr *TableRestore, r } newRowIDBase = maxRowIDMax newRowIDMax = newRowIDBase + rawRowIDMax - query = fmt.Sprintf("update mysql.brie_sub_tasks set row_id_base = %d, row_id_max = %d, status = '%s', where table_id = %d and task_id = %d", - newRowIDBase, newRowIDMax, newStatus.String(), m.tr.tableInfo.ID, metaLightningID) - _, err = tx.ExecContext(ctx, query) - return errors.Trace(err) + // table contains no data, can skip checksum + if newRowIDBase == 0 && newStatus < metaStatusRestoreStarted { + newStatus = metaStatusRestoreStarted + } + query = "update mysql.brie_sub_tasks set row_id_base = ?, row_id_max = ?, status = ?, where table_id = ? and task_id = ?" + res, err := tx.ExecContext(ctx, query, newRowIDBase, newRowIDMax, newStatus.String(), m.tr.tableInfo.ID, metaLightningID) + if err != nil { + return errors.Trace(err) + } + rowCnt, err := res.RowsAffected() + if err != nil { + return errors.Trace(err) + } + if rowCnt != 1 { + return errors.New("exec query failed") + } + curStatus = newStatus } return nil }) + if err != nil { + return nil, 0, errors.Trace(err) + } var checksum *verify.KVChecksum // need to do checksum and update checksum meta since we are the first one. - if curStatus < metaStatusRestoreStarted && newStatus < metaStatusRestoreStarted { + if curStatus < metaStatusRestoreStarted { // table contains data but haven't do checksum yet if newRowIDBase > 0 && baseTotalKvs == 0 { baseChecksum, err := DoChecksum(ctx, tr.tableInfo) @@ -2903,6 +2932,9 @@ func (m *tableMetaMgr) AllocTableRowIDs(ctx context.Context, tr *TableRestore, r tr.logger.Info("checksum before restore table", zap.Object("checksum", checksum)) } + if err = m.updateTableStatus(ctx, metaStatusRestoreStarted); err != nil { + return nil, 0, errors.Trace(err) + } } return checksum, newRowIDBase, nil } @@ -2984,10 +3016,9 @@ func (m *tableMetaMgr) checkAndUpdateLocalChecksum(ctx context.Context, checksum return rows.Err() } - query = fmt.Sprintf("update mysql.brie_sub_tasks set total_kvs = %d, total_bytes = %d, checksum = %d, status = '%s' where table_id = %d and task_id = %d", - checksum.SumKVS(), checksum.SumSize(), checksum.Sum(), newStatus, m.tr.tableInfo.ID, m.taskID) + query = "update mysql.brie_sub_tasks set total_kvs = ?, total_bytes = ?, checksum = ?, status = ? where table_id = ? and task_id = ?" 
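
For context on what ends up in total_kvs, total_bytes and checksum: the remote checksum compared at the end is the commutative merge of the pre-import base values and every finished task's values, where KV counts and byte counts add up and the 64-bit checksums combine with XOR, so the order in which tasks finish does not matter. A self-contained illustration, with kvChecksum as a stripped-down stand-in for verification.KVChecksum:

// kvChecksum is a stand-in for the (kvs, bytes, checksum) triple stored per task.
type kvChecksum struct {
	kvs, bytes, sum uint64
}

// merge folds the parts into one table-level checksum the same way
// checkAndUpdateLocalChecksum accumulates base plus per-task values.
func merge(parts ...kvChecksum) kvChecksum {
	var total kvChecksum
	for _, p := range parts {
		total.kvs += p.kvs
		total.bytes += p.bytes
		total.sum ^= p.sum // XOR: independent of the order tasks finish in
	}
	return total
}
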
- _, err = tx.ExecContext(ctx, query) + _, err = tx.ExecContext(ctx, query, checksum.SumKVS(), checksum.SumSize(), checksum.Sum(), newStatus, m.tr.tableInfo.ID, m.taskID) return errors.Trace(err) }) if err != nil { diff --git a/pkg/lightning/restore/restore_test.go b/pkg/lightning/restore/restore_test.go index 70865a950..4ed59dfc6 100644 --- a/pkg/lightning/restore/restore_test.go +++ b/pkg/lightning/restore/restore_test.go @@ -16,6 +16,7 @@ package restore import ( "context" "fmt" + "go.uber.org/zap" "io/ioutil" "path/filepath" "sort" @@ -1307,3 +1308,68 @@ func (s *restoreSchemaSuite) TestRestoreSchemaContextCancel(c *C) { c.Assert(err, NotNil) c.Assert(err, Equals, childCtx.Err()) } + +func (s *restoreSuite) TestAllocTableRowIDs(c *C) { + p := parser.New() + se := tmock.NewContext() + + ctx := context.Background() + + db, m, err := sqlmock.New() + c.Assert(err, IsNil) + conn, err := db.Conn(ctx) + c.Assert(err, IsNil) + + node, err := p.ParseOneStmt("CREATE TABLE `t1` (`c1` varchar(5) NOT NULL)", "utf8mb4", "utf8mb4_bin") + c.Assert(err, IsNil) + tableInfo, err := ddl.MockTableInfo(se, node.(*ast.CreateTableStmt), int64(1)) + c.Assert(err, IsNil) + tableInfo.State = model.StatePublic + + schema := "test" + tb := "t1" + ti := &TidbTableInfo{ + ID: tableInfo.ID, + DB: schema, + Name: tb, + Core: tableInfo, + } + + tableName := common.UniqueTable(schema, tb) + logger := log.With(zap.String("table", tableName)) + tr := &TableRestore{ + tableName: tableName, + tableInfo: ti, + logger: logger, + } + + mgr := &tableMetaMgr{ + session: common.SQLWithRetry{ + DB: conn, + Logger: logger, + }, + taskID: 1, + tr: tr, + } + + m.ExpectExec("SET SESSION tidb_txn_mode = 'pessimistic';"). + WillReturnResult(sqlmock.NewResult(int64(0), int64(0))) + + m.ExpectBegin() + m.ExpectQuery("\\QSELECT task_id, row_id_base, row_id_max, total_kvs_base, total_bytes_base, checksum_base, status from mysql.brie_sub_tasks WHERE table_id = ? FOR UPDATE\\E"). + WithArgs(int64(1)). + WillReturnRows(sqlmock.NewRows([]string{"task_id", "row_id_base", "row_id_max", "total_kvs_base", "total_bytes_base", "checksum_base", "status"}). + AddRow("1", int64(0), int64(0), uint64(0), uint64(0), uint64(0), "initialized")) + m.ExpectQuery("SHOW TABLE `test`.`t1` NEXT_ROW_ID"). + WillReturnRows(sqlmock.NewRows([]string{"DB_NAME", "TABLE_NAME", "COLUMN_NAME", "NEXT_GLOBAL_ROW_ID", "ID_TYPE"}). + AddRow("test", "t1", "_tidb_rowid", int64(1), "AUTO_INCREMENT")) + m.ExpectExec("\\Qupdate mysql.brie_sub_tasks set row_id_base = ?, row_id_max = ?, status = ?, where table_id = ? and task_id = ?\\E"). + WithArgs(int64(0), int64(10), "restore", int64(1), int64(1)). 
+ WillReturnResult(sqlmock.NewResult(int64(0), int64(1))) + m.ExpectCommit() + ck, rowIDBase, err := mgr.AllocTableRowIDs(ctx, tr, 10) + c.Assert(err, IsNil) + c.Assert(rowIDBase, Equals, int64(0)) + c.Assert(ck, IsNil) + +} From 4c977d17b17e06f586b47f1810ba5328871e65eb Mon Sep 17 00:00:00 2001 From: glorv Date: Tue, 16 Mar 2021 17:59:26 +0800 Subject: [PATCH 12/32] fix --- pkg/lightning/restore/restore.go | 105 ++++++++++++++++---------- pkg/lightning/restore/restore_test.go | 15 ++-- pkg/lightning/restore/tidb.go | 1 + 3 files changed, 70 insertions(+), 51 deletions(-) diff --git a/pkg/lightning/restore/restore.go b/pkg/lightning/restore/restore.go index 55d68edfa..aabbd1e4d 100644 --- a/pkg/lightning/restore/restore.go +++ b/pkg/lightning/restore/restore.go @@ -1230,36 +1230,27 @@ func (rc *RestoreController) restoreTables(ctx context.Context) error { } close(postProcessTaskChan) + // TODO: support Lightning via SQL + db, err := rc.tidbGlue.GetDB() + if err != nil { + return errors.Trace(err) + } + // otherwise, we should run all tasks in the post-process task chan for i := 0; i < rc.cfg.App.TableConcurrency; i++ { wg.Add(1) go func() { + defer wg.Done() for task := range postProcessTaskChan { - // TODO: support Lightning via SQL - db, err := rc.tidbGlue.GetDB() - if err != nil { - restoreErr.Set(err) - continue - } - conn, err := db.Conn(ctx) - if err != nil { - restoreErr.Set(err) - continue - } - metaMgr := &tableMetaMgr{ - session: common.SQLWithRetry{ - DB: conn, - Logger: task.tr.logger, - }, - taskID: rc.cfg.TaskID, - tr: task.tr, + session: db, + taskID: rc.cfg.TaskID, + tr: task.tr, } // force all the remain post-process tasks to be executed _, err = task.tr.postProcess(ctx2, rc, task.cp, true, metaMgr) restoreErr.Set(err) } - wg.Done() }() } wg.Wait() @@ -1287,18 +1278,11 @@ func (t *TableRestore) restoreTable( if err != nil { return false, errors.Trace(err) } - conn, err := db.Conn(ctx) - if err != nil { - return false, errors.Trace(err) - } metaMgr := &tableMetaMgr{ - session: common.SQLWithRetry{ - DB: conn, - Logger: t.logger, - }, - taskID: rc.cfg.TaskID, - tr: t, + session: db, + taskID: rc.cfg.TaskID, + tr: t, } // no need to do anything if the chunks are already populated @@ -2726,16 +2710,20 @@ func (cr *chunkRestore) restore( } type tableMetaMgr struct { - session common.SQLWithRetry + session *sql.DB taskID int64 tr *TableRestore } func (m *tableMetaMgr) InitTableMeta(ctx context.Context) error { + exec := &common.SQLWithRetry{ + DB: m.session, + Logger: m.tr.logger, + } // avoid override existing metadata if the meta is already inserted. 
stmt := `INSERT IGNORE INTO mysql.brie_sub_tasks (task_id, table_id, table_name, status) values (?, ?, ?, ?)` task := m.tr.logger.Begin(zap.DebugLevel, "init table meta") - err := m.session.Exec(ctx, "init table meta", stmt, m.taskID, m.tr.tableInfo.ID, m.tr.tableName, metaStatusInitial.String()) + err := exec.Exec(ctx, "init table meta", stmt, m.taskID, m.tr.tableInfo.ID, m.tr.tableName, metaStatusInitial.String()) task.End(zap.ErrorLevel, err) return errors.Trace(err) } @@ -2793,20 +2781,30 @@ func parseMetaStatus(s string) (metaStatus, error) { } func (m *tableMetaMgr) AllocTableRowIDs(ctx context.Context, tr *TableRestore, rawRowIDMax int64) (*verify.KVChecksum, int64, error) { + conn, err := m.session.Conn(ctx) + if err != nil { + return nil, 0, errors.Trace(err) + } + defer conn.Close() + exec := &common.SQLWithRetry{ + DB: m.session, + Logger: m.tr.logger, + } var newRowIDBase, newRowIDMax int64 curStatus := metaStatusInitial newStatus := metaStatusRowIDAllocated var baseTotalKvs, baseTotalBytes, baseChecksum uint64 - err := m.session.Exec(ctx, "enable pessimistic transaction", "SET SESSION tidb_txn_mode = 'pessimistic';") + err = exec.Exec(ctx, "enable pessimistic transaction", "SET SESSION tidb_txn_mode = 'pessimistic';") if err != nil { return nil, 0, errors.Annotate(err, "enable pessimistic transaction failed") } - err = m.session.Transact(ctx, "init table allocator base", func(ctx context.Context, tx *sql.Tx) error { + err = exec.Transact(ctx, "init table allocator base", func(ctx context.Context, tx *sql.Tx) error { query := fmt.Sprintf("SELECT task_id, row_id_base, row_id_max, total_kvs_base, total_bytes_base, checksum_base, status from mysql.brie_sub_tasks WHERE table_id = ? FOR UPDATE") rows, err := tx.QueryContext(ctx, query, m.tr.tableInfo.ID) if err != nil { return errors.Trace(err) } + defer rows.Close() var ( metaLightningID, rowIDBase, rowIDMax, maxRowIDMax int64 statusValue string @@ -2892,7 +2890,7 @@ func (m *tableMetaMgr) AllocTableRowIDs(ctx context.Context, tr *TableRestore, r if newRowIDBase == 0 && newStatus < metaStatusRestoreStarted { newStatus = metaStatusRestoreStarted } - query = "update mysql.brie_sub_tasks set row_id_base = ?, row_id_max = ?, status = ?, where table_id = ? and task_id = ?" + query = "update mysql.brie_sub_tasks set row_id_base = ?, row_id_max = ?, status = ? where table_id = ? and task_id = ?" 
res, err := tx.ExecContext(ctx, query, newRowIDBase, newRowIDMax, newStatus.String(), m.tr.tableInfo.ID, metaLightningID) if err != nil { return errors.Trace(err) @@ -2940,20 +2938,41 @@ func (m *tableMetaMgr) AllocTableRowIDs(ctx context.Context, tr *TableRestore, r } func (m *tableMetaMgr) UpdateTableBaseChecksum(ctx context.Context, checksum *verify.KVChecksum) error { + exec := &common.SQLWithRetry{ + DB: m.session, + Logger: m.tr.logger, + } query := fmt.Sprintf("update mysql.brie_sub_tasks set total_kvs_base = %d, total_bytes_base = %d, checksum_base = %d, status = '%s' where table_id = %d and task_id = %d", checksum.SumKVS(), checksum.SumSize(), checksum.Sum(), metaStatusRestoreStarted.String(), m.tr.tableInfo.ID, m.taskID) - return m.session.Exec(ctx, "update base checksum", query) + return exec.Exec(ctx, "update base checksum", query) } func (m *tableMetaMgr) updateTableStatus(ctx context.Context, status metaStatus) error { + exec := &common.SQLWithRetry{ + DB: m.session, + Logger: m.tr.logger, + } query := fmt.Sprintf("update mysql.brie_sub_tasks set status = '%s' where table_id = %d and task_id = %d", status.String(), m.tr.tableInfo.ID, m.taskID) - return m.session.Exec(ctx, "update meta status", query) + return exec.Exec(ctx, "update meta status", query) } func (m *tableMetaMgr) checkAndUpdateLocalChecksum(ctx context.Context, checksum *verify.KVChecksum) (bool, *verify.KVChecksum, error) { + conn, err := m.session.Conn(ctx) + if err != nil { + return false, nil, errors.Trace(err) + } + defer conn.Close() + exec := &common.SQLWithRetry{ + DB: m.session, + Logger: m.tr.logger, + } + err = exec.Exec(ctx, "enable pessimistic transaction", "SET SESSION tidb_txn_mode = 'pessimistic';") + if err != nil { + return false, nil, errors.Annotate(err, "enable pessimistic transaction failed") + } var ( baseTotalKvs, baseTotalBytes, baseChecksum uint64 taskKvs, taskBytes, taskChecksum uint64 @@ -2961,12 +2980,13 @@ func (m *tableMetaMgr) checkAndUpdateLocalChecksum(ctx context.Context, checksum ) newStatus := metaStatusChecksuming needChecksum := true - err := m.session.Transact(ctx, "checksum pre-check", func(ctx context.Context, tx *sql.Tx) error { - query := fmt.Sprintf("SELECT task_id, total_kvs_base, total_bytes_base, checksum_base, total_kvs, total_bytes, checksum, status from mysql.brie_sub_tasks WHERE table_id = ?") + err = exec.Transact(ctx, "checksum pre-check", func(ctx context.Context, tx *sql.Tx) error { + query := fmt.Sprintf("SELECT task_id, total_kvs_base, total_bytes_base, checksum_base, total_kvs, total_bytes, checksum, status from mysql.brie_sub_tasks WHERE table_id = ? FOR UPDATE") rows, err := tx.QueryContext(ctx, query, m.tr.tableInfo.ID) if err != nil { return errors.Trace(err) } + defer rows.Close() var ( taskID int64 statusValue string @@ -3017,7 +3037,6 @@ func (m *tableMetaMgr) checkAndUpdateLocalChecksum(ctx context.Context, checksum } query = "update mysql.brie_sub_tasks set total_kvs = ?, total_bytes = ?, checksum = ?, status = ? where table_id = ? and task_id = ?" - _, err = tx.ExecContext(ctx, query, checksum.SumKVS(), checksum.SumSize(), checksum.Sum(), newStatus, m.tr.tableInfo.ID, m.taskID) return errors.Trace(err) }) @@ -3034,6 +3053,10 @@ func (m *tableMetaMgr) checkAndUpdateLocalChecksum(ctx context.Context, checksum } func (m *tableMetaMgr) FinishTable(ctx context.Context) error { - return m.session.Exec(ctx, "clean up metas", "DELETE FROM mysql.brie_sub_tasks where table_id = ? 
and (status = 'checksuming' or status = 'checksum_skipped')", - m.tr.tableInfo.ID) + exec := &common.SQLWithRetry{ + DB: m.session, + Logger: m.tr.logger, + } + query := "DELETE FROM mysql.brie_sub_tasks where table_id = ? and (status = 'checksuming' or status = 'checksum_skipped')" + return exec.Exec(ctx, "clean up metas", query, m.tr.tableInfo.ID) } diff --git a/pkg/lightning/restore/restore_test.go b/pkg/lightning/restore/restore_test.go index 4ed59dfc6..a56a0dbdc 100644 --- a/pkg/lightning/restore/restore_test.go +++ b/pkg/lightning/restore/restore_test.go @@ -16,7 +16,6 @@ package restore import ( "context" "fmt" - "go.uber.org/zap" "io/ioutil" "path/filepath" "sort" @@ -35,6 +34,7 @@ import ( filter "github.com/pingcap/tidb-tools/pkg/table-filter" "github.com/pingcap/tidb/ddl" tmock "github.com/pingcap/tidb/util/mock" + "go.uber.org/zap" kv "github.com/pingcap/br/pkg/lightning/backend" "github.com/pingcap/br/pkg/lightning/checkpoints" @@ -1317,8 +1317,6 @@ func (s *restoreSuite) TestAllocTableRowIDs(c *C) { db, m, err := sqlmock.New() c.Assert(err, IsNil) - conn, err := db.Conn(ctx) - c.Assert(err, IsNil) node, err := p.ParseOneStmt("CREATE TABLE `t1` (`c1` varchar(5) NOT NULL)", "utf8mb4", "utf8mb4_bin") c.Assert(err, IsNil) @@ -1344,12 +1342,9 @@ func (s *restoreSuite) TestAllocTableRowIDs(c *C) { } mgr := &tableMetaMgr{ - session: common.SQLWithRetry{ - DB: conn, - Logger: logger, - }, - taskID: 1, - tr: tr, + session: db, + taskID: 1, + tr: tr, } m.ExpectExec("SET SESSION tidb_txn_mode = 'pessimistic';"). @@ -1363,7 +1358,7 @@ func (s *restoreSuite) TestAllocTableRowIDs(c *C) { m.ExpectQuery("SHOW TABLE `test`.`t1` NEXT_ROW_ID"). WillReturnRows(sqlmock.NewRows([]string{"DB_NAME", "TABLE_NAME", "COLUMN_NAME", "NEXT_GLOBAL_ROW_ID", "ID_TYPE"}). AddRow("test", "t1", "_tidb_rowid", int64(1), "AUTO_INCREMENT")) - m.ExpectExec("\\Qupdate mysql.brie_sub_tasks set row_id_base = ?, row_id_max = ?, status = ?, where table_id = ? and task_id = ?\\E"). + m.ExpectExec("\\Qupdate mysql.brie_sub_tasks set row_id_base = ?, row_id_max = ?, status = ? where table_id = ? and task_id = ?\\E"). WithArgs(int64(0), int64(10), "restore", int64(1), int64(1)). 
WillReturnResult(sqlmock.NewResult(int64(0), int64(1))) m.ExpectCommit() diff --git a/pkg/lightning/restore/tidb.go b/pkg/lightning/restore/tidb.go index a672e7742..7dd2d7586 100644 --- a/pkg/lightning/restore/tidb.go +++ b/pkg/lightning/restore/tidb.go @@ -19,6 +19,7 @@ import ( "fmt" "strconv" "strings" + "time" tmysql "github.com/go-sql-driver/mysql" "github.com/pingcap/errors" From f818bffbcfcd482ba739565644c68e15864717ea Mon Sep 17 00:00:00 2001 From: glorv Date: Tue, 16 Mar 2021 21:56:17 +0800 Subject: [PATCH 13/32] fix close rows --- pkg/lightning/restore/restore.go | 37 ++++++++++++++++---------------- pkg/lightning/restore/tidb.go | 1 - 2 files changed, 18 insertions(+), 20 deletions(-) diff --git a/pkg/lightning/restore/restore.go b/pkg/lightning/restore/restore.go index aabbd1e4d..674e5e4b8 100644 --- a/pkg/lightning/restore/restore.go +++ b/pkg/lightning/restore/restore.go @@ -2773,6 +2773,8 @@ func parseMetaStatus(s string) (metaStatus, error) { return metaStatusRestoreFinished, nil case "checksuming": return metaStatusChecksuming, nil + case "checksum_skipped": + return metaStatusChecksumSkipped, nil case "finish": return metaStatusFinished, nil default: @@ -2891,17 +2893,11 @@ func (m *tableMetaMgr) AllocTableRowIDs(ctx context.Context, tr *TableRestore, r newStatus = metaStatusRestoreStarted } query = "update mysql.brie_sub_tasks set row_id_base = ?, row_id_max = ?, status = ? where table_id = ? and task_id = ?" - res, err := tx.ExecContext(ctx, query, newRowIDBase, newRowIDMax, newStatus.String(), m.tr.tableInfo.ID, metaLightningID) + _, err := tx.ExecContext(ctx, query, newRowIDBase, newRowIDMax, newStatus.String(), m.tr.tableInfo.ID, metaLightningID) if err != nil { return errors.Trace(err) } - rowCnt, err := res.RowsAffected() - if err != nil { - return errors.Trace(err) - } - if rowCnt != 1 { - return errors.New("exec query failed") - } + curStatus = newStatus } return nil @@ -2942,10 +2938,10 @@ func (m *tableMetaMgr) UpdateTableBaseChecksum(ctx context.Context, checksum *ve DB: m.session, Logger: m.tr.logger, } - query := fmt.Sprintf("update mysql.brie_sub_tasks set total_kvs_base = %d, total_bytes_base = %d, checksum_base = %d, status = '%s' where table_id = %d and task_id = %d", - checksum.SumKVS(), checksum.SumSize(), checksum.Sum(), metaStatusRestoreStarted.String(), m.tr.tableInfo.ID, m.taskID) + query := "update mysql.brie_sub_tasks set total_kvs_base = ?, total_bytes_base = ?, checksum_base = ?, status = ? where table_id = ? and task_id = ?" - return exec.Exec(ctx, "update base checksum", query) + return exec.Exec(ctx, "update base checksum", query, checksum.SumKVS(), + checksum.SumSize(), checksum.Sum(), metaStatusRestoreStarted.String(), m.tr.tableInfo.ID, m.taskID) } func (m *tableMetaMgr) updateTableStatus(ctx context.Context, status metaStatus) error { @@ -2984,9 +2980,14 @@ func (m *tableMetaMgr) checkAndUpdateLocalChecksum(ctx context.Context, checksum query := fmt.Sprintf("SELECT task_id, total_kvs_base, total_bytes_base, checksum_base, total_kvs, total_bytes, checksum, status from mysql.brie_sub_tasks WHERE table_id = ? 
FOR UPDATE") rows, err := tx.QueryContext(ctx, query, m.tr.tableInfo.ID) if err != nil { - return errors.Trace(err) + return errors.Annotate(err, "fetch task meta failed") } - defer rows.Close() + closed := false + defer func() { + if !closed { + rows.Close() + } + }() var ( taskID int64 statusValue string @@ -3031,14 +3032,12 @@ func (m *tableMetaMgr) checkAndUpdateLocalChecksum(ctx context.Context, checksum totalKvs += taskKvs totalChecksum ^= taskChecksum } - - if rows.Err() != nil { - return rows.Err() - } + rows.Close() + closed = true query = "update mysql.brie_sub_tasks set total_kvs = ?, total_bytes = ?, checksum = ?, status = ? where table_id = ? and task_id = ?" - _, err = tx.ExecContext(ctx, query, checksum.SumKVS(), checksum.SumSize(), checksum.Sum(), newStatus, m.tr.tableInfo.ID, m.taskID) - return errors.Trace(err) + _, err = tx.ExecContext(ctx, query, checksum.SumKVS(), checksum.SumSize(), checksum.Sum(), newStatus.String(), m.tr.tableInfo.ID, m.taskID) + return errors.Annotate(err, "update local checksum failed") }) if err != nil { return false, nil, err diff --git a/pkg/lightning/restore/tidb.go b/pkg/lightning/restore/tidb.go index 7dd2d7586..a672e7742 100644 --- a/pkg/lightning/restore/tidb.go +++ b/pkg/lightning/restore/tidb.go @@ -19,7 +19,6 @@ import ( "fmt" "strconv" "strings" - "time" tmysql "github.com/go-sql-driver/mysql" "github.com/pingcap/errors" From 7d64011137a0d836d856e1975c8234cc76b6ea41 Mon Sep 17 00:00:00 2001 From: glorv Date: Wed, 17 Mar 2021 11:05:46 +0800 Subject: [PATCH 14/32] add unit test and fix bug --- pkg/lightning/restore/restore.go | 51 +++--- pkg/lightning/restore/restore_test.go | 220 +++++++++++++++++++++++--- 2 files changed, 227 insertions(+), 44 deletions(-) diff --git a/pkg/lightning/restore/restore.go b/pkg/lightning/restore/restore.go index 674e5e4b8..156cb8134 100644 --- a/pkg/lightning/restore/restore.go +++ b/pkg/lightning/restore/restore.go @@ -1330,12 +1330,14 @@ func (t *TableRestore) restoreTable( t.RebaseChunkRowIDs(cp, rowIDBase) if checksum != nil { - cp.Checksum = *checksum - rc.saveCpCh <- saveCp{ - tableName: t.tableName, - merger: &TableChecksumMerger{ - Checksum: cp.Checksum, - }, + if cp.Checksum != *checksum { + cp.Checksum = *checksum + rc.saveCpCh <- saveCp{ + tableName: t.tableName, + merger: &TableChecksumMerger{ + Checksum: cp.Checksum, + }, + } } t.logger.Info("checksum before restore table", zap.Object("checksum", &cp.Checksum)) } @@ -2808,11 +2810,12 @@ func (m *tableMetaMgr) AllocTableRowIDs(ctx context.Context, tr *TableRestore, r } defer rows.Close() var ( - metaLightningID, rowIDBase, rowIDMax, maxRowIDMax int64 - statusValue string + metaTaskID, rowIDBase, rowIDMax, maxRowIDMax int64 + totalKvs, totalBytes, checksum uint64 + statusValue string ) for rows.Next() { - if err = rows.Scan(&metaLightningID, &rowIDBase, &rowIDMax, &baseTotalKvs, &baseTotalBytes, &baseChecksum, &statusValue); err != nil { + if err = rows.Scan(&metaTaskID, &rowIDBase, &rowIDMax, &totalKvs, &totalBytes, &checksum, &statusValue); err != nil { return errors.Trace(err) } status, err := parseMetaStatus(statusValue) @@ -2825,8 +2828,11 @@ func (m *tableMetaMgr) AllocTableRowIDs(ctx context.Context, tr *TableRestore, r continue } - if metaLightningID == m.taskID { + if metaTaskID == m.taskID { curStatus = status + baseChecksum = checksum + baseTotalKvs = totalKvs + baseTotalBytes = totalBytes if status >= metaStatusRowIDAllocated { if rowIDMax-rowIDBase != rawRowIDMax { return errors.Errorf("verify allocator base failed. 
local: '%d', meta: '%d'", rawRowIDMax, rowIDMax-rowIDBase) @@ -2893,7 +2899,7 @@ func (m *tableMetaMgr) AllocTableRowIDs(ctx context.Context, tr *TableRestore, r newStatus = metaStatusRestoreStarted } query = "update mysql.brie_sub_tasks set row_id_base = ?, row_id_max = ?, status = ? where table_id = ? and task_id = ?" - _, err := tx.ExecContext(ctx, query, newRowIDBase, newRowIDMax, newStatus.String(), m.tr.tableInfo.ID, metaLightningID) + _, err := tx.ExecContext(ctx, query, newRowIDBase, newRowIDMax, newStatus.String(), m.tr.tableInfo.ID, m.taskID) if err != nil { return errors.Trace(err) } @@ -2911,12 +2917,16 @@ func (m *tableMetaMgr) AllocTableRowIDs(ctx context.Context, tr *TableRestore, r if curStatus < metaStatusRestoreStarted { // table contains data but haven't do checksum yet if newRowIDBase > 0 && baseTotalKvs == 0 { - baseChecksum, err := DoChecksum(ctx, tr.tableInfo) + remoteCk, err := DoChecksum(ctx, tr.tableInfo) if err != nil { return nil, 0, errors.Trace(err) } - ck := verify.MakeKVChecksum(baseChecksum.TotalBytes, baseChecksum.TotalKVs, baseChecksum.Checksum) - checksum = &ck + + if remoteCk.Checksum != baseChecksum || remoteCk.TotalKVs != baseTotalKvs || remoteCk.TotalBytes != baseTotalBytes { + ck := verify.MakeKVChecksum(remoteCk.TotalBytes, remoteCk.TotalKVs, remoteCk.Checksum) + checksum = &ck + } + } if checksum != nil { @@ -2925,11 +2935,14 @@ func (m *tableMetaMgr) AllocTableRowIDs(ctx context.Context, tr *TableRestore, r } tr.logger.Info("checksum before restore table", zap.Object("checksum", checksum)) - } - if err = m.updateTableStatus(ctx, metaStatusRestoreStarted); err != nil { + } else if err = m.updateTableStatus(ctx, metaStatusRestoreStarted); err != nil { return nil, 0, errors.Trace(err) } } + if checksum == nil && baseTotalKvs > 0 { + ck := verify.MakeKVChecksum(baseTotalBytes, baseTotalKvs, baseChecksum) + checksum = &ck + } return checksum, newRowIDBase, nil } @@ -2949,10 +2962,8 @@ func (m *tableMetaMgr) updateTableStatus(ctx context.Context, status metaStatus) DB: m.session, Logger: m.tr.logger, } - query := fmt.Sprintf("update mysql.brie_sub_tasks set status = '%s' where table_id = %d and task_id = %d", - status.String(), m.tr.tableInfo.ID, m.taskID) - - return exec.Exec(ctx, "update meta status", query) + query := "update mysql.brie_sub_tasks set status = ? where table_id = ? and task_id = ?" 
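To make the row-id reservation above easier to follow: every task scans all meta rows for the table under FOR UPDATE, remembers the largest row_id_max any task has already claimed, and then reserves the next contiguous block for itself. A rough, illustrative sketch of that rule (the real code additionally tracks per-task status and checksums, as the unit tests in this patch show):

```go
// claimRowIDRange sketches how one task picks its row-id block. maxRowIDMax is
// the largest row_id_max found among all tasks' meta rows, nextRowID is the
// value reported by SHOW TABLE ... NEXT_ROW_ID, and rawRowIDMax is the number
// of rows this task is about to write. Purely illustrative, not project code.
func claimRowIDRange(maxRowIDMax, nextRowID, rawRowIDMax int64) (base, max int64) {
	if maxRowIDMax == 0 {
		// No task has claimed a range yet: start right after the ids the
		// table has already handed out (NEXT_ROW_ID = 1 means it is empty,
		// which yields base 0 and the range (0, rawRowIDMax]).
		base = nextRowID - 1
	} else {
		// Otherwise continue directly after the last claimed range.
		base = maxRowIDMax
	}
	return base, base + rawRowIDMax
}
```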
+ return exec.Exec(ctx, "update meta status", query, status.String(), m.tr.tableInfo.ID, m.taskID) } func (m *tableMetaMgr) checkAndUpdateLocalChecksum(ctx context.Context, checksum *verify.KVChecksum) (bool, *verify.KVChecksum, error) { diff --git a/pkg/lightning/restore/restore_test.go b/pkg/lightning/restore/restore_test.go index a56a0dbdc..1efe39297 100644 --- a/pkg/lightning/restore/restore_test.go +++ b/pkg/lightning/restore/restore_test.go @@ -15,6 +15,8 @@ package restore import ( "context" + "database/sql" + "database/sql/driver" "fmt" "io/ioutil" "path/filepath" @@ -1309,14 +1311,29 @@ func (s *restoreSchemaSuite) TestRestoreSchemaContextCancel(c *C) { c.Assert(err, Equals, childCtx.Err()) } -func (s *restoreSuite) TestAllocTableRowIDs(c *C) { - p := parser.New() - se := tmock.NewContext() +type testChecksumMgr struct { + checksum RemoteChecksum + callCnt int +} - ctx := context.Background() +func (t *testChecksumMgr) Checksum(ctx context.Context, tableInfo *TidbTableInfo) (*RemoteChecksum, error) { + t.callCnt++ + return &t.checksum, nil +} - db, m, err := sqlmock.New() - c.Assert(err, IsNil) +var _ = Suite(&metaMgrSuite{}) + +type metaMgrSuite struct { + dbHandle *sql.DB + mockDB sqlmock.Sqlmock + tr *TableRestore + mgr *tableMetaMgr + checksumMgr *testChecksumMgr +} + +func (s *metaMgrSuite) SetUpSuite(c *C) { + p := parser.New() + se := tmock.NewContext() node, err := p.ParseOneStmt("CREATE TABLE `t1` (`c1` varchar(5) NOT NULL)", "utf8mb4", "utf8mb4_bin") c.Assert(err, IsNil) @@ -1335,36 +1352,191 @@ func (s *restoreSuite) TestAllocTableRowIDs(c *C) { tableName := common.UniqueTable(schema, tb) logger := log.With(zap.String("table", tableName)) - tr := &TableRestore{ + s.tr = &TableRestore{ tableName: tableName, tableInfo: ti, logger: logger, } +} + +func (s *metaMgrSuite) SetUpTest(c *C) { + db, m, err := sqlmock.New() + c.Assert(err, IsNil) - mgr := &tableMetaMgr{ + s.mgr = &tableMetaMgr{ session: db, taskID: 1, - tr: tr, + tr: s.tr, } + s.mockDB = m + s.checksumMgr = &testChecksumMgr{} +} - m.ExpectExec("SET SESSION tidb_txn_mode = 'pessimistic';"). - WillReturnResult(sqlmock.NewResult(int64(0), int64(0))) +func (s *metaMgrSuite) TearDownTest(c *C) { + c.Assert(s.mockDB.ExpectationsWereMet(), IsNil) +} - m.ExpectBegin() - m.ExpectQuery("\\QSELECT task_id, row_id_base, row_id_max, total_kvs_base, total_bytes_base, checksum_base, status from mysql.brie_sub_tasks WHERE table_id = ? FOR UPDATE\\E"). - WithArgs(int64(1)). - WillReturnRows(sqlmock.NewRows([]string{"task_id", "row_id_base", "row_id_max", "total_kvs_base", "total_bytes_base", "checksum_base", "status"}). - AddRow("1", int64(0), int64(0), uint64(0), uint64(0), uint64(0), "initialized")) - m.ExpectQuery("SHOW TABLE `test`.`t1` NEXT_ROW_ID"). - WillReturnRows(sqlmock.NewRows([]string{"DB_NAME", "TABLE_NAME", "COLUMN_NAME", "NEXT_GLOBAL_ROW_ID", "ID_TYPE"}). - AddRow("test", "t1", "_tidb_rowid", int64(1), "AUTO_INCREMENT")) - m.ExpectExec("\\Qupdate mysql.brie_sub_tasks set row_id_base = ?, row_id_max = ?, status = ? where table_id = ? and task_id = ?\\E"). - WithArgs(int64(0), int64(10), "restore", int64(1), int64(1)). 
- WillReturnResult(sqlmock.NewResult(int64(0), int64(1))) - m.ExpectCommit() - ck, rowIDBase, err := mgr.AllocTableRowIDs(ctx, tr, 10) +func (s *metaMgrSuite) TestAllocTableRowIDsSingleTable(c *C) { + ctx := context.WithValue(context.Background(), &checksumManagerKey, s.checksumMgr) + + rows := [][]driver.Value{ + {int64(1), int64(0), int64(0), uint64(0), uint64(0), uint64(0), "initialized"}, + } + nextID := int64(1) + updateArgs := []driver.Value{int64(0), int64(10), "restore", int64(1), int64(1)} + s.prepareMock(rows, &nextID, updateArgs, nil, nil) + + ck, rowIDBase, err := s.mgr.AllocTableRowIDs(ctx, s.tr, 10) + c.Assert(err, IsNil) + c.Assert(rowIDBase, Equals, int64(0)) + c.Assert(ck, IsNil) + c.Assert(s.checksumMgr.callCnt, Equals, 0) +} + +func (s *metaMgrSuite) TestAllocTableRowIDsSingleTableAutoIDNot0(c *C) { + ctx := context.WithValue(context.Background(), &checksumManagerKey, s.checksumMgr) + + rows := [][]driver.Value{ + {int64(1), int64(0), int64(0), uint64(0), uint64(0), uint64(0), "initialized"}, + } + nextID := int64(999) + updateArgs := []driver.Value{int64(998), int64(1008), "allocated", int64(1), int64(1)} + newStatus := "restore" + s.prepareMock(rows, &nextID, updateArgs, nil, &newStatus) + + ck, rowIDBase, err := s.mgr.AllocTableRowIDs(ctx, s.tr, 10) + c.Assert(err, IsNil) + c.Assert(rowIDBase, Equals, int64(998)) + c.Assert(ck, IsNil) + c.Assert(s.checksumMgr.callCnt, Equals, 1) +} + +func (s *metaMgrSuite) TestAllocTableRowIDsSingleTableContainsData(c *C) { + ctx := context.WithValue(context.Background(), &checksumManagerKey, s.checksumMgr) + + rows := [][]driver.Value{ + {int64(1), int64(0), int64(0), uint64(0), uint64(0), uint64(0), "initialized"}, + } + nextID := int64(999) + checksum := verification.MakeKVChecksum(1, 2, 3) + updateArgs := []driver.Value{int64(998), int64(1008), "allocated", int64(1), int64(1)} + s.prepareMock(rows, &nextID, updateArgs, &checksum, nil) + + ck, rowIDBase, err := s.mgr.AllocTableRowIDs(ctx, s.tr, 10) + c.Assert(err, IsNil) + c.Assert(rowIDBase, Equals, int64(998)) + c.Assert(ck, DeepEquals, &checksum) + c.Assert(s.checksumMgr.callCnt, Equals, 1) +} + +func (s *metaMgrSuite) TestAllocTableRowIDsAllocated(c *C) { + ctx := context.WithValue(context.Background(), &checksumManagerKey, s.checksumMgr) + + rows := [][]driver.Value{ + {int64(1), int64(998), int64(1008), uint64(0), uint64(0), uint64(0), metaStatusRowIDAllocated.String()}, + } + checksum := verification.MakeKVChecksum(2, 1, 3) + s.prepareMock(rows, nil, nil, &checksum, nil) + + ck, rowIDBase, err := s.mgr.AllocTableRowIDs(ctx, s.tr, 10) + c.Assert(err, IsNil) + c.Assert(rowIDBase, Equals, int64(998)) + c.Assert(ck, DeepEquals, &checksum) + c.Assert(s.checksumMgr.callCnt, Equals, 1) +} + +func (s *metaMgrSuite) TestAllocTableRowIDsFinished(c *C) { + ctx := context.WithValue(context.Background(), &checksumManagerKey, s.checksumMgr) + + rows := [][]driver.Value{ + {int64(1), int64(998), int64(1008), uint64(1), uint64(2), uint64(3), metaStatusRestoreStarted.String()}, + } + checksum := verification.MakeKVChecksum(2, 1, 3) + s.prepareMock(rows, nil, nil, nil, nil) + + ck, rowIDBase, err := s.mgr.AllocTableRowIDs(ctx, s.tr, 10) + c.Assert(err, IsNil) + c.Assert(rowIDBase, Equals, int64(998)) + c.Assert(ck, DeepEquals, &checksum) + c.Assert(s.checksumMgr.callCnt, Equals, 0) +} + +func (s *metaMgrSuite) TestAllocTableRowIDsMultiTasksInit(c *C) { + ctx := context.WithValue(context.Background(), &checksumManagerKey, s.checksumMgr) + + rows := [][]driver.Value{ + {int64(1), int64(0), 
int64(0), uint64(0), uint64(0), uint64(0), "initialized"}, + {int64(2), int64(0), int64(0), uint64(0), uint64(0), uint64(0), "initialized"}, + } + nextID := int64(1) + updateArgs := []driver.Value{int64(0), int64(10), "restore", int64(1), int64(1)} + s.prepareMock(rows, &nextID, updateArgs, nil, nil) + + ck, rowIDBase, err := s.mgr.AllocTableRowIDs(ctx, s.tr, 10) c.Assert(err, IsNil) c.Assert(rowIDBase, Equals, int64(0)) c.Assert(ck, IsNil) + c.Assert(s.checksumMgr.callCnt, Equals, 0) +} +func (s *metaMgrSuite) TestAllocTableRowIDsMultiTasksAllocated(c *C) { + ctx := context.WithValue(context.Background(), &checksumManagerKey, s.checksumMgr) + + rows := [][]driver.Value{ + {int64(1), int64(0), int64(0), uint64(0), uint64(0), uint64(0), metaStatusInitial.String()}, + {int64(2), int64(0), int64(100), uint64(0), uint64(0), uint64(0), metaStatusRowIDAllocated.String()}, + } + updateArgs := []driver.Value{int64(100), int64(110), "restore", int64(1), int64(1)} + s.prepareMock(rows, nil, updateArgs, nil, nil) + + ck, rowIDBase, err := s.mgr.AllocTableRowIDs(ctx, s.tr, 10) + c.Assert(err, IsNil) + c.Assert(rowIDBase, Equals, int64(100)) + c.Assert(ck, IsNil) + c.Assert(s.checksumMgr.callCnt, Equals, 0) +} + +func (s *metaMgrSuite) prepareMock(rowsVal [][]driver.Value, nextRowID *int64, updateArgs []driver.Value, checksum *verification.KVChecksum, updateStatus *string) { + s.mockDB.ExpectExec("SET SESSION tidb_txn_mode = 'pessimistic';"). + WillReturnResult(sqlmock.NewResult(int64(0), int64(0))) + + s.mockDB.ExpectBegin() + + rows := sqlmock.NewRows([]string{"task_id", "row_id_base", "row_id_max", "total_kvs_base", "total_bytes_base", "checksum_base", "status"}) + for _, r := range rowsVal { + rows = rows.AddRow(r...) + } + s.mockDB.ExpectQuery("\\QSELECT task_id, row_id_base, row_id_max, total_kvs_base, total_bytes_base, checksum_base, status from mysql.brie_sub_tasks WHERE table_id = ? FOR UPDATE\\E"). + WithArgs(int64(1)). + WillReturnRows(rows) + if nextRowID != nil { + s.mockDB.ExpectQuery("SHOW TABLE `test`.`t1` NEXT_ROW_ID"). + WillReturnRows(sqlmock.NewRows([]string{"DB_NAME", "TABLE_NAME", "COLUMN_NAME", "NEXT_GLOBAL_ROW_ID", "ID_TYPE"}). + AddRow("test", "t1", "_tidb_rowid", *nextRowID, "AUTO_INCREMENT")) + } + + if len(updateArgs) > 0 { + s.mockDB.ExpectExec("\\Qupdate mysql.brie_sub_tasks set row_id_base = ?, row_id_max = ?, status = ? where table_id = ? and task_id = ?\\E"). + WithArgs(updateArgs...). + WillReturnResult(sqlmock.NewResult(int64(0), int64(1))) + } + + s.mockDB.ExpectCommit() + + if checksum != nil { + s.mockDB.ExpectExec("\\Qupdate mysql.brie_sub_tasks set total_kvs_base = ?, total_bytes_base = ?, checksum_base = ?, status = ? where table_id = ? and task_id = ?\\E"). + WithArgs(checksum.SumKVS(), checksum.SumSize(), checksum.Sum(), metaStatusRestoreStarted.String(), int64(1), int64(1)). + WillReturnResult(sqlmock.NewResult(int64(0), int64(1))) + s.checksumMgr.checksum = RemoteChecksum{ + TotalBytes: checksum.SumSize(), + TotalKVs: checksum.SumKVS(), + Checksum: checksum.Sum(), + } + } + + if updateStatus != nil { + s.mockDB.ExpectExec("\\Qupdate mysql.brie_sub_tasks set status = ? where table_id = ? and task_id = ?\\E"). + WithArgs(*updateStatus, int64(1), int64(1)). 
+ WillReturnResult(sqlmock.NewResult(int64(0), int64(1))) + } } From e2b0a91f951bf6360e14ecc321b7757421ccd231 Mon Sep 17 00:00:00 2001 From: glorv Date: Wed, 17 Mar 2021 15:40:35 +0800 Subject: [PATCH 15/32] fix integration test --- pkg/lightning/restore/restore.go | 4 ---- tests/lightning_error_summary/data/error_summary.c.sql | 2 +- tests/lightning_local_backend/run.sh | 4 ++-- 3 files changed, 3 insertions(+), 7 deletions(-) diff --git a/pkg/lightning/restore/restore.go b/pkg/lightning/restore/restore.go index 156cb8134..b4f66a9f8 100644 --- a/pkg/lightning/restore/restore.go +++ b/pkg/lightning/restore/restore.go @@ -1463,10 +1463,6 @@ func (t *TableRestore) restoreEngines(pCtx context.Context, rc *RestoreControlle return } - failpoint.Inject("FailBeforeDataEngineImported", func() { - panic("forcing failure due to FailBeforeDataEngineImported") - }) - dataWorker := rc.closedEngineLimit.Apply() defer rc.closedEngineLimit.Recycle(dataWorker) if err := t.importEngine(ctx, dataClosedEngine, rc, eid, ecp); err != nil { diff --git a/tests/lightning_error_summary/data/error_summary.c.sql b/tests/lightning_error_summary/data/error_summary.c.sql index be11c04ab..4ed9e54a4 100644 --- a/tests/lightning_error_summary/data/error_summary.c.sql +++ b/tests/lightning_error_summary/data/error_summary.c.sql @@ -1 +1 @@ -INSERT INTO c VALUES (10, 100), (1000, 10000); +INSERT INTO c VALUES (3, 100), (1000, 10000); diff --git a/tests/lightning_local_backend/run.sh b/tests/lightning_local_backend/run.sh index 4e7b38b19..bde495e12 100755 --- a/tests/lightning_local_backend/run.sh +++ b/tests/lightning_local_backend/run.sh @@ -58,7 +58,7 @@ run_sql 'DROP DATABASE cpeng;' rm -f "/tmp/tidb_lightning_checkpoint_local_backend_test.pb" set +e -export GO_FAILPOINTS='github.com/pingcap/br/pkg/lightning/restore/FailBeforeDataEngineImported=return' +export GO_FAILPOINTS='github.com/pingcap/br/pkg/lightning/restore/FailIfStatusBecomes=return(90);' for i in $(seq "$ENGINE_COUNT"); do echo "******** Importing Table Now (step $i/$ENGINE_COUNT) ********" run_lightning --backend local --enable-checkpoint=1 --log-file "$TEST_DIR/lightning-local.log" --config "tests/$TEST_NAME/config.toml" @@ -120,4 +120,4 @@ for ckpt in mysql file; do --config=tests/$TEST_NAME/$ckpt.toml >$TEST_DIR/lightning_ctl.output 2>&1 grep -Fq "No table has lost intermediate files according to given config" $TEST_DIR/lightning_ctl.output done -rm -r $TEST_DIR/$TEST_NAME.sorted \ No newline at end of file +rm -r $TEST_DIR/$TEST_NAME.sorted From e0b5d049311876be42e35c0f22f920f8f8c2e61b Mon Sep 17 00:00:00 2001 From: glorv Date: Thu, 18 Mar 2021 10:30:02 +0800 Subject: [PATCH 16/32] fix import name change --- pkg/lightning/restore/restore.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pkg/lightning/restore/restore.go b/pkg/lightning/restore/restore.go index 391ff6d2f..c6cdbfd59 100644 --- a/pkg/lightning/restore/restore.go +++ b/pkg/lightning/restore/restore.go @@ -2881,7 +2881,7 @@ func (m *tableMetaMgr) AllocTableRowIDs(ctx context.Context, tr *TableRestore, r return errors.Errorf("table %s contains auto increment id or _tidb_rowid, but target field not found", tr.tableName) } - autoIDInfos, err := kv.FetchTableAutoIDInfos(ctx, tx, m.tr.tableName) + autoIDInfos, err := tidb.FetchTableAutoIDInfos(ctx, tx, m.tr.tableName) if err != nil { return errors.Trace(err) } From 458b299775246e5f25ffa907e1fefb74092a5e44 Mon Sep 17 00:00:00 2001 From: glorv Date: Thu, 18 Mar 2021 11:24:54 +0800 Subject: [PATCH 17/32] fix auto_random primary key 
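On the integration-test change above: the scripted failure is now driven by the FailIfStatusBecomes failpoint through the GO_FAILPOINTS variable rather than the removed FailBeforeDataEngineImported injection. A minimal sketch of how such a hook is declared with github.com/pingcap/failpoint; the function and the exact abort condition are illustrative, only the failpoint name and path come from run.sh:

```go
package restore

import "github.com/pingcap/failpoint"

// reportEngineStatus is a stand-in for the place where the import status
// changes. When the binary is built with failpoints enabled and
// GO_FAILPOINTS='github.com/pingcap/br/pkg/lightning/restore/FailIfStatusBecomes=return(90)'
// is exported, the callback receives 90 and aborts on purpose, which is what
// the checkpoint-recovery loop in run.sh relies on.
func reportEngineStatus(status int) {
	failpoint.Inject("FailIfStatusBecomes", func(val failpoint.Value) {
		if wanted, ok := val.(int); ok && wanted == status {
			panic("injected failure before status change")
		}
	})
}
```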
--- tests/lightning_incremental/data/incr.auto_random-schema.sql | 2 +- tests/lightning_incremental/data1/incr.auto_random-schema.sql | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/lightning_incremental/data/incr.auto_random-schema.sql b/tests/lightning_incremental/data/incr.auto_random-schema.sql index 712c45921..028c7c9d9 100644 --- a/tests/lightning_incremental/data/incr.auto_random-schema.sql +++ b/tests/lightning_incremental/data/incr.auto_random-schema.sql @@ -1,5 +1,5 @@ /*!40103 SET TIME_ZONE='+00:00' */; CREATE TABLE `auto_random` ( - `id` bigint primary key auto_random, + `id` bigint primary key clustered auto_random, v varchar(255) ) ENGINE=InnoDB DEFAULT CHARSET=utf8 COLLATE=utf8_bin; diff --git a/tests/lightning_incremental/data1/incr.auto_random-schema.sql b/tests/lightning_incremental/data1/incr.auto_random-schema.sql index 712c45921..028c7c9d9 100644 --- a/tests/lightning_incremental/data1/incr.auto_random-schema.sql +++ b/tests/lightning_incremental/data1/incr.auto_random-schema.sql @@ -1,5 +1,5 @@ /*!40103 SET TIME_ZONE='+00:00' */; CREATE TABLE `auto_random` ( - `id` bigint primary key auto_random, + `id` bigint primary key clustered auto_random, v varchar(255) ) ENGINE=InnoDB DEFAULT CHARSET=utf8 COLLATE=utf8_bin; From dae79a227355d8ece8655ce46e7a43718df22139 Mon Sep 17 00:00:00 2001 From: glorv Date: Tue, 20 Apr 2021 12:23:06 +0800 Subject: [PATCH 18/32] add taskmeta to sync schedulers and swith-mode between multi lightning --- pkg/lightning/config/config.go | 16 +- pkg/lightning/restore/restore.go | 633 ++++++++++++++++++++++++------- pkg/pdutil/pd.go | 78 +++- tidb-lightning.toml | 4 + 4 files changed, 570 insertions(+), 161 deletions(-) diff --git a/pkg/lightning/config/config.go b/pkg/lightning/config/config.go index 3803c05d6..af31ac997 100644 --- a/pkg/lightning/config/config.go +++ b/pkg/lightning/config/config.go @@ -71,6 +71,8 @@ const ( defaultIndexSerialScanConcurrency = 20 defaultChecksumTableConcurrency = 2 + defaultMetaSchemaName = "lighting_metadata" + // autoDiskQuotaLocalReservedSpeed is the estimated size increase per // millisecond per write thread the local backend may gain on all engines. 
// This is used to compute the maximum size overshoot between two disk quota @@ -148,11 +150,12 @@ func (cfg *Config) ToTLS() (*common.TLS, error) { } type Lightning struct { - TableConcurrency int `toml:"table-concurrency" json:"table-concurrency"` - IndexConcurrency int `toml:"index-concurrency" json:"index-concurrency"` - RegionConcurrency int `toml:"region-concurrency" json:"region-concurrency"` - IOConcurrency int `toml:"io-concurrency" json:"io-concurrency"` - CheckRequirements bool `toml:"check-requirements" json:"check-requirements"` + TableConcurrency int `toml:"table-concurrency" json:"table-concurrency"` + IndexConcurrency int `toml:"index-concurrency" json:"index-concurrency"` + RegionConcurrency int `toml:"region-concurrency" json:"region-concurrency"` + IOConcurrency int `toml:"io-concurrency" json:"io-concurrency"` + CheckRequirements bool `toml:"check-requirements" json:"check-requirements"` + MetaSchemaName string `toml:"meta-schema-name" json:"meta-schema-name"` } type PostOpLevel int @@ -656,6 +659,9 @@ func (cfg *Config) DefaultVarsForImporterAndLocalBackend() { if cfg.App.TableConcurrency == 0 { cfg.App.TableConcurrency = 6 } + if len(cfg.App.MetaSchemaName) == 0 { + cfg.App.MetaSchemaName = defaultMetaSchemaName + } if cfg.TikvImporter.RangeConcurrency == 0 { cfg.TikvImporter.RangeConcurrency = 16 } diff --git a/pkg/lightning/restore/restore.go b/pkg/lightning/restore/restore.go index 22b258a25..908f28836 100644 --- a/pkg/lightning/restore/restore.go +++ b/pkg/lightning/restore/restore.go @@ -16,6 +16,7 @@ package restore import ( "context" "database/sql" + "encoding/json" "fmt" "io" "math" @@ -83,8 +84,10 @@ const ( ) const ( - // CreateBRIESubJobTable stores the per-table sub jobs information used by TiDB Lightning - CreateBRIESubJobTable = `CREATE TABLE IF NOT EXISTS mysql.brie_sub_tasks ( + taskMetaTableName = "task_meta" + tableMetaTableName = "table_meta" + // CreateTableMetadataTable stores the per-table sub jobs information used by TiDB Lightning + CreateTableMetadataTable = `CREATE TABLE IF NOT EXISTS %s.%s ( task_id BIGINT(20) UNSIGNED, table_id BIGINT(64) NOT NULL, table_name VARCHAR(64) NOT NULL, @@ -99,6 +102,13 @@ const ( status VARCHAR(32) NOT NULL, PRIMARY KEY (table_id, task_id) );` + // CreateTaskMetaTable stores the pre-lightning metadata used by TiDB Lightning + CreateTaskMetaTable = `CREATE TABLE IF NOT EXISTS %s.%s ( + task_id BIGINT(20) UNSIGNED NOT NULL, + pd_cfgs VARCHAR(2048) NOT NULL DEFAULT '', + status VARCHAR(32) NOT NULL, + PRIMARY KEY (task_id) + );` ) // DeliverPauser is a shared pauser to pause progress to (*chunkRestore).encodeLoop @@ -694,8 +704,21 @@ func (rc *Controller) restoreSchema(ctx context.Context) error { // TODO: maybe we should not create this table here since user may not have write permission to the `mysql` db. 
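A note on the DDL templates above: schema and table names are identifiers, so they are filled into the %s.%s slots with fmt.Sprintf before execution, while row values keep using ? placeholders (MySQL only binds parameters for values). A hedged sketch of how the task-meta table is expected to be created for the configured meta schema, assuming a plain *sql.DB handle alongside the constants above rather than the glue executor:

```go
// createTaskMetaTable renders the CreateTaskMetaTable template for the schema
// named by cfg.App.MetaSchemaName and executes it. Illustrative only.
func createTaskMetaTable(ctx context.Context, db *sql.DB, schemaName string) error {
	if _, err := db.ExecContext(ctx, fmt.Sprintf("CREATE DATABASE IF NOT EXISTS `%s`", schemaName)); err != nil {
		return err
	}
	stmt := fmt.Sprintf(CreateTaskMetaTable, schemaName, taskMetaTableName)
	_, err := db.ExecContext(ctx, stmt)
	return err
}
```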
// ensure meta table exists - if err := rc.tidbGlue.GetSQLExecutor().ExecuteWithLog(ctx, CreateBRIESubJobTable, "create meta table", log.L()); err != nil { - return errors.Annotate(err, "create meta table failed") + if rc.cfg.TikvImporter.Backend != config.BackendTiDB { + exec := rc.tidbGlue.GetSQLExecutor() + logger := log.L() + metaDBSQL := fmt.Sprintf("CREATE DATABASE IF NOT EXISTS `%s`", rc.cfg.App.MetaSchemaName) + if err := exec.ExecuteWithLog(ctx, metaDBSQL, "create meta schema", logger); err != nil { + return errors.Annotate(err, "create meta schema failed") + } + taskMetaSQL := fmt.Sprintf(CreateTaskMetaTable, rc.cfg.App.MetaSchemaName, taskMetaTableName) + if err := exec.ExecuteWithLog(ctx, taskMetaSQL, "create meta table", log.L()); err != nil { + return errors.Annotate(err, "create task meta table failed") + } + tableMetaSQL := fmt.Sprintf(CreateTableMetadataTable, rc.cfg.App.MetaSchemaName, tableMetaTableName) + if err := exec.ExecuteWithLog(ctx, tableMetaSQL, "create meta table", log.L()); err != nil { + return errors.Annotate(err, "create table meta table failed") + } } // Estimate the number of chunks for progress reporting @@ -945,7 +968,10 @@ func (rc *Controller) listenCheckpointUpdates() { rc.checkpointsWg.Done() } -func (rc *Controller) runPeriodicActions(ctx context.Context, stop <-chan struct{}) { +// buildRunPeriodicActionAndCancelFunc build the runPeriodicAction func and a cancel func +func (rc *Controller) buildRunPeriodicActionAndCancelFunc(ctx context.Context, stop <-chan struct{}) (func(), func(bool)) { + cancelFuncs := make([]func(bool), 0) + // a nil channel blocks forever. // if the cron duration is zero we use the nil channel to skip the action. var logProgressChan <-chan time.Time @@ -962,127 +988,143 @@ func (rc *Controller) runPeriodicActions(ctx context.Context, stop <-chan struct // tidb backend don't need to switch tikv to import mode if rc.cfg.TikvImporter.Backend != config.BackendTiDB && rc.cfg.Cron.SwitchMode.Duration > 0 { switchModeTicker := time.NewTicker(rc.cfg.Cron.SwitchMode.Duration) - defer switchModeTicker.Stop() - switchModeChan = switchModeTicker.C + cancelFuncs = append(cancelFuncs, func(bool) { switchModeTicker.Stop() }) + cancelFuncs = append(cancelFuncs, func(do bool) { + if do { + if err := rc.switchToNormalMode(ctx); err != nil { + log.L().Warn("switch tikv to normal mode failed", zap.Error(err)) + } + } - rc.switchToImportMode(ctx) + }) + switchModeChan = switchModeTicker.C } var checkQuotaChan <-chan time.Time // only local storage has disk quota concern. 
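The refactor that starts above turns the periodic work into a pair of closures: a run function that owns the ticker loops, and a cancel function that stops the tickers and, only when asked, performs the final switch-back. A stripped-down sketch of the same pattern, detached from the controller (the names and the one-minute interval are invented; assumes the standard time package):

```go
// buildWorkerAndCancel returns a loop that fires tick until stop is closed,
// plus a cancel func that always stops the ticker and optionally runs the
// switch-back step, mirroring the cancelFuncs slice used above.
func buildWorkerAndCancel(tick, switchBack func()) (run func(stop <-chan struct{}), cancel func(doSwitchBack bool)) {
	ticker := time.NewTicker(time.Minute)
	cancelFuncs := []func(bool){
		func(bool) { ticker.Stop() },
		func(do bool) {
			if do {
				switchBack()
			}
		},
	}
	run = func(stop <-chan struct{}) {
		for {
			select {
			case <-stop:
				return
			case <-ticker.C:
				tick()
			}
		}
	}
	cancel = func(do bool) {
		for _, f := range cancelFuncs {
			f(do)
		}
	}
	return run, cancel
}
```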
if rc.cfg.TikvImporter.Backend == config.BackendLocal && rc.cfg.Cron.CheckDiskQuota.Duration > 0 { checkQuotaTicker := time.NewTicker(rc.cfg.Cron.CheckDiskQuota.Duration) - defer checkQuotaTicker.Stop() + cancelFuncs = append(cancelFuncs, func(bool) { checkQuotaTicker.Stop() }) checkQuotaChan = checkQuotaTicker.C } - start := time.Now() - for { - select { - case <-ctx.Done(): - log.L().Warn("stopping periodic actions", log.ShortError(ctx.Err())) - return - case <-stop: - log.L().Info("everything imported, stopping periodic actions") - return - - case <-switchModeChan: - // periodically switch to import mode, as requested by TiKV 3.0 - rc.switchToImportMode(ctx) - - case <-logProgressChan: - // log the current progress periodically, so OPS will know that we're still working - nanoseconds := float64(time.Since(start).Nanoseconds()) - // the estimated chunk is not accurate(likely under estimated), but the actual count is not accurate - // before the last table start, so use the bigger of the two should be a workaround - estimated := metric.ReadCounter(metric.ChunkCounter.WithLabelValues(metric.ChunkStateEstimated)) - pending := metric.ReadCounter(metric.ChunkCounter.WithLabelValues(metric.ChunkStatePending)) - if estimated < pending { - estimated = pending - } - finished := metric.ReadCounter(metric.ChunkCounter.WithLabelValues(metric.ChunkStateFinished)) - totalTables := metric.ReadCounter(metric.TableCounter.WithLabelValues(metric.TableStatePending, metric.TableResultSuccess)) - completedTables := metric.ReadCounter(metric.TableCounter.WithLabelValues(metric.TableStateCompleted, metric.TableResultSuccess)) - bytesRead := metric.ReadHistogramSum(metric.RowReadBytesHistogram) - engineEstimated := metric.ReadCounter(metric.ProcessedEngineCounter.WithLabelValues(metric.ChunkStateEstimated, metric.TableResultSuccess)) - enginePending := metric.ReadCounter(metric.ProcessedEngineCounter.WithLabelValues(metric.ChunkStatePending, metric.TableResultSuccess)) - if engineEstimated < enginePending { - engineEstimated = enginePending - } - engineFinished := metric.ReadCounter(metric.ProcessedEngineCounter.WithLabelValues(metric.TableStateImported, metric.TableResultSuccess)) - bytesWritten := metric.ReadCounter(metric.BytesCounter.WithLabelValues(metric.TableStateWritten)) - bytesImported := metric.ReadCounter(metric.BytesCounter.WithLabelValues(metric.TableStateImported)) - - var state string - var remaining zap.Field - switch { - case finished >= estimated: - if engineFinished < engineEstimated { - state = "importing" - } else { - state = "post-processing" - } - case finished > 0: - state = "writing" - default: - state = "preparing" - } - - // since we can't accurately estimate the extra time cost by import after all writing are finished, - // so here we use estimatedWritingProgress * 0.8 + estimatedImportingProgress * 0.2 as the total - // progress. 
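A concrete number may help with the weighting described in the comment above. Suppose half of the chunks have been written, a quarter of the estimated total bytes have been imported, and 90 minutes have elapsed; the logged estimate then works out to roughly 110 minutes remaining (a hand-computed example, not output from a real run):

```go
// estimateRemaining mirrors the 80%/20% weighting above. With
// writePercent = 0.5, importPercent = 0.25 and 90 minutes elapsed,
// totalPercent is 0.45 and the result is about 110 minutes.
func estimateRemaining(writePercent, importPercent float64, elapsed time.Duration) time.Duration {
	totalPercent := writePercent*0.8 + importPercent*0.2
	return time.Duration((1.0 - totalPercent) / totalPercent * float64(elapsed)).Round(time.Second)
}
```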
- remaining = zap.Skip() - totalPercent := 0.0 - if finished > 0 { - writePercent := math.Min(finished/estimated, 1.0) - importPercent := 1.0 - if bytesWritten > 0 { - totalBytes := bytesWritten / writePercent - importPercent = math.Min(bytesImported/totalBytes, 1.0) - } - totalPercent = writePercent*0.8 + importPercent*0.2 - if totalPercent < 1.0 { - remainNanoseconds := (1.0 - totalPercent) / totalPercent * nanoseconds - remaining = zap.Duration("remaining", time.Duration(remainNanoseconds).Round(time.Second)) - } + return func() { + // tidb backend don't need to switch tikv to import mode + if rc.cfg.TikvImporter.Backend != config.BackendTiDB && rc.cfg.Cron.SwitchMode.Duration > 0 { + rc.switchToImportMode(ctx) } + start := time.Now() + for { + select { + case <-ctx.Done(): + log.L().Warn("stopping periodic actions", log.ShortError(ctx.Err())) + return + case <-stop: + log.L().Info("everything imported, stopping periodic actions") + return - formatPercent := func(finish, estimate float64) string { - speed := "" - if estimated > 0 { - speed = fmt.Sprintf(" (%.1f%%)", finish/estimate*100) - } - return speed - } + case <-switchModeChan: + // periodically switch to import mode, as requested by TiKV 3.0 + rc.switchToImportMode(ctx) + + case <-logProgressChan: + // log the current progress periodically, so OPS will know that we're still working + nanoseconds := float64(time.Since(start).Nanoseconds()) + // the estimated chunk is not accurate(likely under estimated), but the actual count is not accurate + // before the last table start, so use the bigger of the two should be a workaround + estimated := metric.ReadCounter(metric.ChunkCounter.WithLabelValues(metric.ChunkStateEstimated)) + pending := metric.ReadCounter(metric.ChunkCounter.WithLabelValues(metric.ChunkStatePending)) + if estimated < pending { + estimated = pending + } + finished := metric.ReadCounter(metric.ChunkCounter.WithLabelValues(metric.ChunkStateFinished)) + totalTables := metric.ReadCounter(metric.TableCounter.WithLabelValues(metric.TableStatePending, metric.TableResultSuccess)) + completedTables := metric.ReadCounter(metric.TableCounter.WithLabelValues(metric.TableStateCompleted, metric.TableResultSuccess)) + bytesRead := metric.ReadHistogramSum(metric.RowReadBytesHistogram) + engineEstimated := metric.ReadCounter(metric.ProcessedEngineCounter.WithLabelValues(metric.ChunkStateEstimated, metric.TableResultSuccess)) + enginePending := metric.ReadCounter(metric.ProcessedEngineCounter.WithLabelValues(metric.ChunkStatePending, metric.TableResultSuccess)) + if engineEstimated < enginePending { + engineEstimated = enginePending + } + engineFinished := metric.ReadCounter(metric.ProcessedEngineCounter.WithLabelValues(metric.TableStateImported, metric.TableResultSuccess)) + bytesWritten := metric.ReadCounter(metric.BytesCounter.WithLabelValues(metric.TableStateWritten)) + bytesImported := metric.ReadCounter(metric.BytesCounter.WithLabelValues(metric.TableStateImported)) - // avoid output bytes speed if there are no unfinished chunks - chunkSpeed := zap.Skip() - if bytesRead > 0 { - chunkSpeed = zap.Float64("speed(MiB/s)", bytesRead/(1048576e-9*nanoseconds)) - } + var state string + var remaining zap.Field + switch { + case finished >= estimated: + if engineFinished < engineEstimated { + state = "importing" + } else { + state = "post-processing" + } + case finished > 0: + state = "writing" + default: + state = "preparing" + } - // Note: a speed of 28 MiB/s roughly corresponds to 100 GiB/hour. 
- log.L().Info("progress", - zap.String("total", fmt.Sprintf("%.1f%%", totalPercent*100)), - // zap.String("files", fmt.Sprintf("%.0f/%.0f (%.1f%%)", finished, estimated, finished/estimated*100)), - zap.String("tables", fmt.Sprintf("%.0f/%.0f%s", completedTables, totalTables, formatPercent(completedTables, totalTables))), - zap.String("chunks", fmt.Sprintf("%.0f/%.0f%s", finished, estimated, formatPercent(finished, estimated))), - zap.String("engines", fmt.Sprintf("%.f/%.f%s", engineFinished, engineEstimated, formatPercent(engineFinished, engineEstimated))), - chunkSpeed, - zap.String("state", state), - remaining, - ) + // since we can't accurately estimate the extra time cost by import after all writing are finished, + // so here we use estimatedWritingProgress * 0.8 + estimatedImportingProgress * 0.2 as the total + // progress. + remaining = zap.Skip() + totalPercent := 0.0 + if finished > 0 { + writePercent := math.Min(finished/estimated, 1.0) + importPercent := 1.0 + if bytesWritten > 0 { + totalBytes := bytesWritten / writePercent + importPercent = math.Min(bytesImported/totalBytes, 1.0) + } + totalPercent = writePercent*0.8 + importPercent*0.2 + if totalPercent < 1.0 { + remainNanoseconds := (1.0 - totalPercent) / totalPercent * nanoseconds + remaining = zap.Duration("remaining", time.Duration(remainNanoseconds).Round(time.Second)) + } + } - case <-checkQuotaChan: - // verify the total space occupied by sorted-kv-dir is below the quota, - // otherwise we perform an emergency import. - rc.enforceDiskQuota(ctx) + formatPercent := func(finish, estimate float64) string { + speed := "" + if estimated > 0 { + speed = fmt.Sprintf(" (%.1f%%)", finish/estimate*100) + } + return speed + } - case <-glueProgressTicker.C: - finished := metric.ReadCounter(metric.ChunkCounter.WithLabelValues(metric.ChunkStateFinished)) - rc.tidbGlue.Record(glue.RecordFinishedChunk, uint64(finished)) + // avoid output bytes speed if there are no unfinished chunks + chunkSpeed := zap.Skip() + if bytesRead > 0 { + chunkSpeed = zap.Float64("speed(MiB/s)", bytesRead/(1048576e-9*nanoseconds)) + } + + // Note: a speed of 28 MiB/s roughly corresponds to 100 GiB/hour. + log.L().Info("progress", + zap.String("total", fmt.Sprintf("%.1f%%", totalPercent*100)), + // zap.String("files", fmt.Sprintf("%.0f/%.0f (%.1f%%)", finished, estimated, finished/estimated*100)), + zap.String("tables", fmt.Sprintf("%.0f/%.0f%s", completedTables, totalTables, formatPercent(completedTables, totalTables))), + zap.String("chunks", fmt.Sprintf("%.0f/%.0f%s", finished, estimated, formatPercent(finished, estimated))), + zap.String("engines", fmt.Sprintf("%.f/%.f%s", engineFinished, engineEstimated, formatPercent(engineFinished, engineEstimated))), + chunkSpeed, + zap.String("state", state), + remaining, + ) + + case <-checkQuotaChan: + // verify the total space occupied by sorted-kv-dir is below the quota, + // otherwise we perform an emergency import. 
+ rc.enforceDiskQuota(ctx) + + case <-glueProgressTicker.C: + finished := metric.ReadCounter(metric.ChunkCounter.WithLabelValues(metric.ChunkStateFinished)) + rc.tidbGlue.Record(glue.RecordFinishedChunk, uint64(finished)) + } + } + }, func(do bool) { + for _, f := range cancelFuncs { + f(do) + } } - } } var checksumManagerKey struct{} @@ -1094,6 +1136,10 @@ func (rc *Controller) restoreTables(ctx context.Context) error { // make split region and ingest sst more stable // because importer backend is mostly use for v3.x cluster which doesn't support these api, // so we also don't do this for import backend + finishSchedulers := func() {} + // if one lightning failed abnormally, and can't determine whether it needs to switch back, + // we do not do switch back automatically + switchBack := false if rc.cfg.TikvImporter.Backend == config.BackendLocal { // disable some pd schedulers pdController, err := pdutil.NewPdController(ctx, rc.cfg.TiDB.PdAddr, @@ -1101,17 +1147,52 @@ func (rc *Controller) restoreTables(ctx context.Context) error { if err != nil { return errors.Trace(err) } + + db, err := rc.tidbGlue.GetDB() + if err != nil { + return errors.Trace(err) + } + mgr := taskMetaMgr{ + pd: pdController, + taskID: rc.cfg.TaskID, + session: db, + schemaName: rc.cfg.App.MetaSchemaName, + tableName: common.UniqueTable(rc.cfg.App.MetaSchemaName, taskMetaTableName), + } + + if err = mgr.initTask(ctx); err != nil { + return err + } + logTask.Info("removing PD leader®ion schedulers") - restoreFn, e := pdController.RemoveSchedulers(ctx) - defer func() { - // use context.Background to make sure this restore function can still be executed even if ctx is canceled - if restoreE := restoreFn(context.Background()); restoreE != nil { - logTask.Warn("failed to restore removed schedulers, you may need to restore them manually", zap.Error(restoreE)) - return + + restoreFn, err := mgr.checkAndPausePdSchedulers(ctx) + finishSchedulers = func() { + if restoreFn != nil { + // use context.Background to make sure this restore function can still be executed even if ctx is canceled + restoreCtx := context.Background() + needSwitchBack, err := mgr.CheckAndFinishRestore(restoreCtx) + if err != nil { + logTask.Warn("check restore pd schedulers failed", zap.Error(err)) + return + } + switchBack = needSwitchBack + if needSwitchBack { + if restoreE := restoreFn(restoreCtx); restoreE != nil { + logTask.Warn("failed to restore removed schedulers, you may need to restore them manually", zap.Error(restoreE)) + } + if cleanupErr := mgr.cleanup(restoreCtx); cleanupErr != nil { + logTask.Warn("failed to clean task metas, you may need to restore them manually", zap.Error(cleanupErr)) + } + } + + logTask.Info("add back PD leader®ion schedulers") } - logTask.Info("add back PD leader®ion schedulers") - }() - if e != nil { + + pdController.Close() + } + + if err != nil { return errors.Trace(err) } } @@ -1131,7 +1212,18 @@ func (rc *Controller) restoreTables(ctx context.Context) error { var restoreErr common.OnceError stopPeriodicActions := make(chan struct{}) - go rc.runPeriodicActions(ctx, stopPeriodicActions) + + periodicActions, cancelFunc := rc.buildRunPeriodicActionAndCancelFunc(ctx, stopPeriodicActions) + go periodicActions() + finishFuncCalled := false + defer func() { + if !finishFuncCalled { + finishSchedulers() + cancelFunc(switchBack) + finishFuncCalled = true + } + }() + defer close(stopPeriodicActions) taskCh := make(chan task, rc.cfg.App.IndexConcurrency) @@ -1278,6 +1370,12 @@ func (rc *Controller) restoreTables(ctx 
context.Context) error { default: } + // stop periodic tasks for restore table such as pd schedulers and switch-mode tasks. + // this can help make cluster switching back to normal state more quickly. + finishSchedulers() + cancelFunc(switchBack) + finishFuncCalled = true + close(postProcessTaskChan) // TODO: support Lightning via SQL db, err := rc.tidbGlue.GetDB() @@ -2773,9 +2871,10 @@ func (cr *chunkRestore) restore( } type tableMetaMgr struct { - session *sql.DB - taskID int64 - tr *TableRestore + session *sql.DB + taskID int64 + tr *TableRestore + tableName string } func (m *tableMetaMgr) InitTableMeta(ctx context.Context) error { @@ -2784,9 +2883,9 @@ func (m *tableMetaMgr) InitTableMeta(ctx context.Context) error { Logger: m.tr.logger, } // avoid override existing metadata if the meta is already inserted. - stmt := `INSERT IGNORE INTO mysql.brie_sub_tasks (task_id, table_id, table_name, status) values (?, ?, ?, ?)` + stmt := `INSERT IGNORE INTO ? (task_id, table_id, table_name, status) values (?, ?, ?, ?)` task := m.tr.logger.Begin(zap.DebugLevel, "init table meta") - err := exec.Exec(ctx, "init table meta", stmt, m.taskID, m.tr.tableInfo.ID, m.tr.tableName, metaStatusInitial.String()) + err := exec.Exec(ctx, "init table meta", stmt, m.tableName, m.taskID, m.tr.tableInfo.ID, m.tr.tableName, metaStatusInitial.String()) task.End(zap.ErrorLevel, err) return errors.Trace(err) } @@ -2864,8 +2963,8 @@ func (m *tableMetaMgr) AllocTableRowIDs(ctx context.Context, rawRowIDMax int64) return nil, 0, errors.Annotate(err, "enable pessimistic transaction failed") } err = exec.Transact(ctx, "init table allocator base", func(ctx context.Context, tx *sql.Tx) error { - query := fmt.Sprintf("SELECT task_id, row_id_base, row_id_max, total_kvs_base, total_bytes_base, checksum_base, status from mysql.brie_sub_tasks WHERE table_id = ? FOR UPDATE") - rows, err := tx.QueryContext(ctx, query, m.tr.tableInfo.ID) + query := "SELECT task_id, row_id_base, row_id_max, total_kvs_base, total_bytes_base, checksum_base, status from ? WHERE table_id = ? FOR UPDATE" + rows, err := tx.QueryContext(ctx, query, m.tableName, m.tr.tableInfo.ID) if err != nil { return errors.Trace(err) } @@ -2959,8 +3058,8 @@ func (m *tableMetaMgr) AllocTableRowIDs(ctx context.Context, rawRowIDMax int64) if newRowIDBase == 0 && newStatus < metaStatusRestoreStarted { newStatus = metaStatusRestoreStarted } - query = "update mysql.brie_sub_tasks set row_id_base = ?, row_id_max = ?, status = ? where table_id = ? and task_id = ?" - _, err := tx.ExecContext(ctx, query, newRowIDBase, newRowIDMax, newStatus.String(), m.tr.tableInfo.ID, m.taskID) + query = "update ? set row_id_base = ?, row_id_max = ?, status = ? where table_id = ? and task_id = ?" + _, err := tx.ExecContext(ctx, query, m.tableName, newRowIDBase, newRowIDMax, newStatus.String(), m.tr.tableInfo.ID, m.taskID) if err != nil { return errors.Trace(err) } @@ -3012,9 +3111,9 @@ func (m *tableMetaMgr) UpdateTableBaseChecksum(ctx context.Context, checksum *ve DB: m.session, Logger: m.tr.logger, } - query := "update mysql.brie_sub_tasks set total_kvs_base = ?, total_bytes_base = ?, checksum_base = ?, status = ? where table_id = ? and task_id = ?" + query := "update ? set total_kvs_base = ?, total_bytes_base = ?, checksum_base = ?, status = ? where table_id = ? and task_id = ?" 
- return exec.Exec(ctx, "update base checksum", query, checksum.SumKVS(), + return exec.Exec(ctx, "update base checksum", query, m.tableName, checksum.SumKVS(), checksum.SumSize(), checksum.Sum(), metaStatusRestoreStarted.String(), m.tr.tableInfo.ID, m.taskID) } @@ -3023,8 +3122,8 @@ func (m *tableMetaMgr) updateTableStatus(ctx context.Context, status metaStatus) DB: m.session, Logger: m.tr.logger, } - query := "update mysql.brie_sub_tasks set status = ? where table_id = ? and task_id = ?" - return exec.Exec(ctx, "update meta status", query, status.String(), m.tr.tableInfo.ID, m.taskID) + query := "update ? set status = ? where table_id = ? and task_id = ?" + return exec.Exec(ctx, "update meta status", query, m.tableName, status.String(), m.tr.tableInfo.ID, m.taskID) } func (m *tableMetaMgr) checkAndUpdateLocalChecksum(ctx context.Context, checksum *verify.KVChecksum) (bool, *verify.KVChecksum, error) { @@ -3049,8 +3148,8 @@ func (m *tableMetaMgr) checkAndUpdateLocalChecksum(ctx context.Context, checksum newStatus := metaStatusChecksuming needChecksum := true err = exec.Transact(ctx, "checksum pre-check", func(ctx context.Context, tx *sql.Tx) error { - query := fmt.Sprintf("SELECT task_id, total_kvs_base, total_bytes_base, checksum_base, total_kvs, total_bytes, checksum, status from mysql.brie_sub_tasks WHERE table_id = ? FOR UPDATE") - rows, err := tx.QueryContext(ctx, query, m.tr.tableInfo.ID) + query := "SELECT task_id, total_kvs_base, total_bytes_base, checksum_base, total_kvs, total_bytes, checksum, status from ? WHERE table_id = ? FOR UPDATE" + rows, err := tx.QueryContext(ctx, query, m.tableName, m.tr.tableInfo.ID) if err != nil { return errors.Annotate(err, "fetch task meta failed") } @@ -3107,8 +3206,8 @@ func (m *tableMetaMgr) checkAndUpdateLocalChecksum(ctx context.Context, checksum rows.Close() closed = true - query = "update mysql.brie_sub_tasks set total_kvs = ?, total_bytes = ?, checksum = ?, status = ? where table_id = ? and task_id = ?" - _, err = tx.ExecContext(ctx, query, checksum.SumKVS(), checksum.SumSize(), checksum.Sum(), newStatus.String(), m.tr.tableInfo.ID, m.taskID) + query = "update ? set total_kvs = ?, total_bytes = ?, checksum = ?, status = ? where table_id = ? and task_id = ?" + _, err = tx.ExecContext(ctx, query, m.tableName, checksum.SumKVS(), checksum.SumSize(), checksum.Sum(), newStatus.String(), m.tr.tableInfo.ID, m.taskID) return errors.Annotate(err, "update local checksum failed") }) if err != nil { @@ -3128,6 +3227,264 @@ func (m *tableMetaMgr) FinishTable(ctx context.Context) error { DB: m.session, Logger: m.tr.logger, } - query := "DELETE FROM mysql.brie_sub_tasks where table_id = ? and (status = 'checksuming' or status = 'checksum_skipped')" - return exec.Exec(ctx, "clean up metas", query, m.tr.tableInfo.ID) + query := "DELETE FROM ? where table_id = ? and (status = 'checksuming' or status = 'checksum_skipped')" + return exec.Exec(ctx, "clean up metas", query, m.tableName, m.tr.tableInfo.ID) +} + +type taskMetaMgr struct { + session *sql.DB + taskID int64 + pd *pdutil.PdController + schemaName string + // unique name of task meta table + tableName string +} + +func (m *taskMetaMgr) InitTaskMeta(ctx context.Context) error { + exec := &common.SQLWithRetry{ + DB: m.session, + Logger: log.L(), + } + // avoid override existing metadata if the meta is already inserted. + stmt := `INSERT IGNORE INTO ? 
(task_id, status) values (?, ?)` + err := exec.Exec(ctx, "init task meta", stmt, m.tableName, m.taskID, metaStatusInitial.String()) + return errors.Trace(err) +} + +type taskMetaStatus uint32 + +const ( + taskMetaStatusInitial taskMetaStatus = iota + taskMetaStatusScheduleSet + taskMetaStatusSwitchSkipped + taskMetaStatusSwitchBack +) + +func (m taskMetaStatus) String() string { + switch m { + case taskMetaStatusInitial: + return "initialized" + case taskMetaStatusScheduleSet: + return "schedule_set" + case taskMetaStatusSwitchSkipped: + return "skip_switch" + case taskMetaStatusSwitchBack: + return "switched" + default: + panic(fmt.Sprintf("unexpected metaStatus value '%d'", m)) + } +} + +func parseTaskMetaStatus(s string) (taskMetaStatus, error) { + switch s { + case "", "initialized": + return taskMetaStatusInitial, nil + case "schedule_set": + return taskMetaStatusScheduleSet, nil + case "skip_switch": + return taskMetaStatusSwitchSkipped, nil + case "switched": + return taskMetaStatusSwitchBack, nil + default: + return taskMetaStatusInitial, errors.Errorf("invalid meta status '%s'", s) + } +} + +type storedCfgs struct { + PauseCfg pdutil.ClusterConfig `json:"paused"` + RestoreCFg pdutil.ClusterConfig `json:"restore"` +} + +func (m *taskMetaMgr) initTask(ctx context.Context) error { + exec := &common.SQLWithRetry{ + DB: m.session, + Logger: log.L(), + } + // avoid override existing metadata if the meta is already inserted. + stmt := `INSERT IGNORE INTO ? (task_id, status) values (?, ?)` + err := exec.Exec(ctx, "init task meta", stmt, m.tableName, m.taskID, taskMetaStatusInitial.String()) + return errors.Trace(err) +} + +func (m *taskMetaMgr) checkAndPausePdSchedulers(ctx context.Context) (pdutil.UndoFunc, error) { + conn, err := m.session.Conn(ctx) + if err != nil { + return nil, errors.Trace(err) + } + defer conn.Close() + exec := &common.SQLWithRetry{ + DB: m.session, + Logger: log.L(), + } + err = exec.Exec(ctx, "enable pessimistic transaction", "SET SESSION tidb_txn_mode = 'pessimistic';") + if err != nil { + return nil, errors.Annotate(err, "enable pessimistic transaction failed") + } + + needSwitch := true + paused := false + var pausedCfg storedCfgs + err = exec.Transact(ctx, "check and pause schedulers", func(ctx context.Context, tx *sql.Tx) error { + query := "SELECT task_id, pd_cfgs, status from ? 
FOR UPDATE" + rows, err := tx.QueryContext(ctx, query, m.tableName) + if err != nil { + return errors.Annotate(err, "fetch task meta failed") + } + closed := false + defer func() { + if !closed { + rows.Close() + } + }() + var ( + taskID int64 + cfg string + statusValue string + ) + var cfgStr string + for rows.Next() { + if err = rows.Scan(&taskID, &cfg, &statusValue); err != nil { + return errors.Trace(err) + } + status, err := parseTaskMetaStatus(statusValue) + if err != nil { + return errors.Annotatef(err, "invalid task meta status '%s'", statusValue) + } + + if status == taskMetaStatusInitial { + continue + } + + if taskID == m.taskID { + if status >= taskMetaStatusSwitchSkipped { + needSwitch = false + return nil + } + } + + if cfg != "" { + cfgStr = cfg + break + } + } + if err = rows.Close(); err != nil { + return errors.Trace(err) + } + closed = true + + if cfgStr != "" { + err = json.Unmarshal([]byte(cfgStr), &pausedCfg) + return errors.Trace(err) + } + + orig, removed, err := m.pd.RemoveSchedulersAndReturn(ctx) + if err != nil { + return errors.Trace(err) + } + paused = true + + pausedCfg = storedCfgs{PauseCfg: removed, RestoreCFg: orig} + jsonByts, err := json.Marshal(&pausedCfg) + if err != nil { + return errors.Trace(err) + } + + query = "update ? set pd_cfgs = ?, status = ? where task_id = ?" + _, err = tx.ExecContext(ctx, query, m.tableName, string(jsonByts), taskMetaStatusScheduleSet.String(), m.taskID) + + return errors.Annotate(err, "update task pd configs failed") + }) + if err != nil { + return nil, err + } + + if !needSwitch { + return nil, nil + } + + if !paused { + if err = m.pd.RemoveSchedulersWithCfg(ctx, pausedCfg.PauseCfg); err != nil { + return nil, err + } + } + + return m.pd.MakeUndoFunctionByConfig(pausedCfg.RestoreCFg), nil +} + +func (m *taskMetaMgr) CheckAndFinishRestore(ctx context.Context) (bool, error) { + conn, err := m.session.Conn(ctx) + if err != nil { + return false, errors.Trace(err) + } + defer conn.Close() + exec := &common.SQLWithRetry{ + DB: m.session, + Logger: log.L(), + } + err = exec.Exec(ctx, "enable pessimistic transaction", "SET SESSION tidb_txn_mode = 'pessimistic';") + if err != nil { + return false, errors.Annotate(err, "enable pessimistic transaction failed") + } + + switchBack := true + err = exec.Transact(ctx, "check and finish schedulers", func(ctx context.Context, tx *sql.Tx) error { + query := "SELECT task_id, status from ? FOR UPDATE" + rows, err := tx.QueryContext(ctx, query, m.tableName) + if err != nil { + return errors.Annotate(err, "fetch task meta failed") + } + closed := false + defer func() { + if !closed { + rows.Close() + } + }() + var ( + taskID int64 + statusValue string + ) + newStatus := taskMetaStatusSwitchBack + for rows.Next() { + if err = rows.Scan(&taskID, &statusValue); err != nil { + return errors.Trace(err) + } + status, err := parseTaskMetaStatus(statusValue) + if err != nil { + return errors.Annotatef(err, "invalid task meta status '%s'", statusValue) + } + + if taskID == m.taskID { + continue + } + + if status < taskMetaStatusSwitchSkipped { + newStatus = taskMetaStatusSwitchSkipped + switchBack = false + break + } + } + if err = rows.Close(); err != nil { + return errors.Trace(err) + } + closed = true + + query = "update ? set status = ? where task_id = ?" 
+ _, err = tx.ExecContext(ctx, query, m.tableName, newStatus.String(), m.taskID) + + return errors.Trace(err) + }) + + return switchBack, err +} + +func (m *taskMetaMgr) cleanup(ctx context.Context) error { + exec := &common.SQLWithRetry{ + DB: m.session, + Logger: log.L(), + } + // avoid override existing metadata if the meta is already inserted. + if err := exec.Exec(ctx, "cleanup task meta tables", "DROP DATABASE ?;", m.schemaName); err != nil { + return errors.Trace(err) + } + return nil } diff --git a/pkg/pdutil/pd.go b/pkg/pdutil/pd.go index 2809b5b96..d0d79859e 100644 --- a/pkg/pdutil/pd.go +++ b/pkg/pdutil/pd.go @@ -72,13 +72,13 @@ func constConfigGeneratorBuilder(val interface{}) pauseConfigGenerator { } } -// clusterConfig represents a set of scheduler whose config have been modified +// ClusterConfig represents a set of scheduler whose config have been modified // along with their original config. -type clusterConfig struct { +type ClusterConfig struct { // Enable PD schedulers before restore - scheduler []string + Schedulers []string `json:"schedulers"` // Original scheudle configuration - scheduleCfg map[string]interface{} + ScheduleCfg map[string]interface{} `json:"schedule_cfg"` } type pauseSchedulerBody struct { @@ -511,14 +511,14 @@ func (p *PdController) doPauseConfigs(ctx context.Context, cfg map[string]interf return p.doUpdatePDScheduleConfig(ctx, cfg, post, prefix) } -func restoreSchedulers(ctx context.Context, pd *PdController, clusterCfg clusterConfig) error { - if err := pd.ResumeSchedulers(ctx, clusterCfg.scheduler); err != nil { +func restoreSchedulers(ctx context.Context, pd *PdController, clusterCfg ClusterConfig) error { + if err := pd.ResumeSchedulers(ctx, clusterCfg.Schedulers); err != nil { return errors.Annotate(err, "fail to add PD schedulers") } - log.Info("restoring config", zap.Any("config", clusterCfg.scheduleCfg)) + log.Info("restoring config", zap.Any("config", clusterCfg.ScheduleCfg)) mergeCfg := make(map[string]interface{}) for cfgKey := range expectPDCfg { - value := clusterCfg.scheduleCfg[cfgKey] + value := clusterCfg.ScheduleCfg[cfgKey] if value == nil { // Ignore non-exist config. continue @@ -538,7 +538,8 @@ func restoreSchedulers(ctx context.Context, pd *PdController, clusterCfg cluster return nil } -func (p *PdController) makeUndoFunctionByConfig(config clusterConfig) UndoFunc { +// MakeUndoFunctionByConfig return an UndoFunc based on specified ClusterConfig +func (p *PdController) MakeUndoFunctionByConfig(config ClusterConfig) UndoFunc { restore := func(ctx context.Context) error { return restoreSchedulers(ctx, p, config) } @@ -547,22 +548,38 @@ func (p *PdController) makeUndoFunctionByConfig(config clusterConfig) UndoFunc { // RemoveSchedulers removes the schedulers that may slow down BR speed. 
func (p *PdController) RemoveSchedulers(ctx context.Context) (undo UndoFunc, err error) { + undo = Nop + + _, removed, err1 := p.RemoveSchedulersAndReturn(ctx) + if err1 != nil { + err = err1 + return + } + + undo = p.MakeUndoFunctionByConfig(ClusterConfig{Schedulers: removed.Schedulers, ScheduleCfg: removed.ScheduleCfg}) + return undo, errors.Trace(err) +} + +// RemoveSchedulersAndReturn pause and remove br related schedule configs and return the origin and modified configs +func (p *PdController) RemoveSchedulersAndReturn(ctx context.Context) (ClusterConfig, ClusterConfig, error) { if span := opentracing.SpanFromContext(ctx); span != nil && span.Tracer() != nil { span1 := span.Tracer().StartSpan("PdController.RemoveSchedulers", opentracing.ChildOf(span.Context())) defer span1.Finish() ctx = opentracing.ContextWithSpan(ctx, span1) } - undo = Nop + originCfg := ClusterConfig{} + removedCfg := ClusterConfig{} stores, err := p.pdClient.GetAllStores(ctx) if err != nil { - return + return originCfg, removedCfg, err } scheduleCfg, err := p.GetPDScheduleConfig(ctx) if err != nil { - return + return originCfg, removedCfg, err } - disablePDCfg := make(map[string]interface{}) + disablePDCfg := make(map[string]interface{}, len(expectPDCfg)) + originPDCfg := make(map[string]interface{}, len(expectPDCfg)) for cfgKey, cfgValFunc := range expectPDCfg { value, ok := scheduleCfg[cfgKey] if !ok { @@ -570,14 +587,17 @@ func (p *PdController) RemoveSchedulers(ctx context.Context) (undo UndoFunc, err continue } disablePDCfg[cfgKey] = cfgValFunc(len(stores), value) + originPDCfg[cfgKey] = value } - undo = p.makeUndoFunctionByConfig(clusterConfig{scheduleCfg: scheduleCfg}) + originCfg.ScheduleCfg = originPDCfg + removedCfg.ScheduleCfg = disablePDCfg + log.Debug("saved PD config", zap.Any("config", scheduleCfg)) // Remove default PD scheduler that may affect restore process. existSchedulers, err := p.ListSchedulers(ctx) if err != nil { - return + return originCfg, removedCfg, err } needRemoveSchedulers := make([]string, 0, len(existSchedulers)) for _, s := range existSchedulers { @@ -586,7 +606,30 @@ func (p *PdController) RemoveSchedulers(ctx context.Context) (undo UndoFunc, err } } + removedSchedulers, err := p.doRemoveSchedulersWith(ctx, needRemoveSchedulers, disablePDCfg) + if err != nil { + return originCfg, removedCfg, err + } + + originCfg.Schedulers = removedSchedulers + removedCfg.Schedulers = removedSchedulers + + return originCfg, removedCfg, nil +} + +// RemoveSchedulersWithCfg removes pd schedulers and configs with specified ClusterConfig +func (p *PdController) RemoveSchedulersWithCfg(ctx context.Context, removeCfg ClusterConfig) error { + _, err := p.doRemoveSchedulersWith(ctx, removeCfg.Schedulers, removeCfg.ScheduleCfg) + return err +} + +func (p *PdController) doRemoveSchedulersWith( + ctx context.Context, + needRemoveSchedulers []string, + disablePDCfg map[string]interface{}, +) ([]string, error) { var removedSchedulers []string + var err error if p.isPauseConfigEnabled() { // after 4.0.8 we can set these config with TTL removedSchedulers, err = p.pauseSchedulersAndConfigWith(ctx, needRemoveSchedulers, disablePDCfg, pdRequest) @@ -595,12 +638,11 @@ func (p *PdController) RemoveSchedulers(ctx context.Context) (undo UndoFunc, err // which doesn't have temporary config setting. 
err = p.doUpdatePDScheduleConfig(ctx, disablePDCfg, pdRequest) if err != nil { - return + return nil, err } removedSchedulers, err = p.pauseSchedulersAndConfigWith(ctx, needRemoveSchedulers, nil, pdRequest) } - undo = p.makeUndoFunctionByConfig(clusterConfig{scheduler: removedSchedulers, scheduleCfg: scheduleCfg}) - return undo, errors.Trace(err) + return removedSchedulers, err } // Close close the connection to pd. diff --git a/tidb-lightning.toml b/tidb-lightning.toml index 68482b4fb..c019a1265 100644 --- a/tidb-lightning.toml +++ b/tidb-lightning.toml @@ -32,6 +32,10 @@ table-concurrency = 6 # adjusted according to monitoring. # Ref: https://en.wikipedia.org/wiki/Disk_buffer#Read-ahead/read-behind # io-concurrency = 5 +# meta-schema-name is (database name) to store lightning task and table metadata. +# the meta schema and tables is store in target tidb cluster. +# this config is only used in "local" and "importer" backend. +# meta-schema-name = "lightning_metadata" # logging level = "info" From 62f10db4d3069412449102bf23e459b96f649ba1 Mon Sep 17 00:00:00 2001 From: glorv Date: Tue, 20 Apr 2021 14:07:15 +0800 Subject: [PATCH 19/32] use custom db to store lighting metas --- pkg/lightning/config/config.go | 3 +- pkg/lightning/restore/restore.go | 115 ++++++++++++++++---------- pkg/lightning/restore/restore_test.go | 7 +- 3 files changed, 77 insertions(+), 48 deletions(-) diff --git a/pkg/lightning/config/config.go b/pkg/lightning/config/config.go index af31ac997..c649de098 100644 --- a/pkg/lightning/config/config.go +++ b/pkg/lightning/config/config.go @@ -71,7 +71,8 @@ const ( defaultIndexSerialScanConcurrency = 20 defaultChecksumTableConcurrency = 2 - defaultMetaSchemaName = "lighting_metadata" + // defaultMetaSchemaName is the default database name used to store lightning metadata + defaultMetaSchemaName = "lightning_metadata" // autoDiskQuotaLocalReservedSpeed is the estimated size increase per // millisecond per write thread the local backend may gain on all engines. 
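The meta-schema-name option only takes effect for the local and importer backends: Lightning creates its task and table meta tables inside that schema on the target cluster and always refers to them by fully qualified names. Below is a minimal sketch of how those qualified names are assembled, assuming a helper that mirrors common.UniqueTable; the task_meta table name is an assumption, while table_meta matches the sqlmock expectations in the updated restore tests later in this series.

package main

import "fmt"

// uniqueTable mirrors common.UniqueTable: it wraps the schema and table names
// in backticks so the qualified name can be spliced directly into SQL text.
func uniqueTable(schema, table string) string {
	return fmt.Sprintf("`%s`.`%s`", schema, table)
}

func main() {
	metaSchema := "lightning_metadata" // default value of meta-schema-name
	fmt.Println(uniqueTable(metaSchema, "task_meta"))  // `lightning_metadata`.`task_meta` (name assumed)
	fmt.Println(uniqueTable(metaSchema, "table_meta")) // `lightning_metadata`.`table_meta`
}

Qualifying every statement with the configured schema name is what allows the meta tables to live outside the mysql schema without changing the SQL that uses them.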
diff --git a/pkg/lightning/restore/restore.go b/pkg/lightning/restore/restore.go index 908f28836..6e0f9d683 100644 --- a/pkg/lightning/restore/restore.go +++ b/pkg/lightning/restore/restore.go @@ -1139,6 +1139,7 @@ func (rc *Controller) restoreTables(ctx context.Context) error { finishSchedulers := func() {} // if one lightning failed abnormally, and can't determine whether it needs to switch back, // we do not do switch back automatically + cleanupFunc := func() {} switchBack := false if rc.cfg.TikvImporter.Backend == config.BackendLocal { // disable some pd schedulers @@ -1153,11 +1154,10 @@ func (rc *Controller) restoreTables(ctx context.Context) error { return errors.Trace(err) } mgr := taskMetaMgr{ - pd: pdController, - taskID: rc.cfg.TaskID, - session: db, - schemaName: rc.cfg.App.MetaSchemaName, - tableName: common.UniqueTable(rc.cfg.App.MetaSchemaName, taskMetaTableName), + pd: pdController, + taskID: rc.cfg.TaskID, + session: db, + tableName: common.UniqueTable(rc.cfg.App.MetaSchemaName, taskMetaTableName), } if err = mgr.initTask(ctx); err != nil { @@ -1181,9 +1181,16 @@ func (rc *Controller) restoreTables(ctx context.Context) error { if restoreE := restoreFn(restoreCtx); restoreE != nil { logTask.Warn("failed to restore removed schedulers, you may need to restore them manually", zap.Error(restoreE)) } + // clean up task metas if cleanupErr := mgr.cleanup(restoreCtx); cleanupErr != nil { logTask.Warn("failed to clean task metas, you may need to restore them manually", zap.Error(cleanupErr)) } + // cleanup table meta and schema db if needed. + cleanupFunc = func() { + if e := cleanupAllMetas(restoreCtx, db, rc.cfg.App.MetaSchemaName); err != nil { + logTask.Warn("failed to clean table task metas, you may need to restore them manually", zap.Error(e)) + } + } } logTask.Info("add back PD leader®ion schedulers") @@ -1196,6 +1203,11 @@ func (rc *Controller) restoreTables(ctx context.Context) error { return errors.Trace(err) } } + defer func() { + if switchBack { + cleanupFunc() + } + }() type task struct { tr *TableRestore @@ -1390,9 +1402,10 @@ func (rc *Controller) restoreTables(ctx context.Context) error { defer wg.Done() for task := range postProcessTaskChan { metaMgr := &tableMetaMgr{ - session: db, - taskID: rc.cfg.TaskID, - tr: task.tr, + session: db, + taskID: rc.cfg.TaskID, + tr: task.tr, + tableName: common.UniqueTable(rc.cfg.App.MetaSchemaName, tableMetaTableName), } // force all the remain post-process tasks to be executed _, err = task.tr.postProcess(ctx2, rc, task.cp, true, metaMgr) @@ -1427,9 +1440,10 @@ func (tr *TableRestore) restoreTable( } metaMgr := &tableMetaMgr{ - session: db, - taskID: rc.cfg.TaskID, - tr: tr, + session: db, + taskID: rc.cfg.TaskID, + tr: tr, + tableName: common.UniqueTable(rc.cfg.App.MetaSchemaName, tableMetaTableName), } // no need to do anything if the chunks are already populated @@ -2883,9 +2897,9 @@ func (m *tableMetaMgr) InitTableMeta(ctx context.Context) error { Logger: m.tr.logger, } // avoid override existing metadata if the meta is already inserted. - stmt := `INSERT IGNORE INTO ? 
(task_id, table_id, table_name, status) values (?, ?, ?, ?)` + stmt := fmt.Sprintf(`INSERT IGNORE INTO %s (task_id, table_id, table_name, status) values (?, ?, ?, ?)`, m.tableName) task := m.tr.logger.Begin(zap.DebugLevel, "init table meta") - err := exec.Exec(ctx, "init table meta", stmt, m.tableName, m.taskID, m.tr.tableInfo.ID, m.tr.tableName, metaStatusInitial.String()) + err := exec.Exec(ctx, "init table meta", stmt, m.taskID, m.tr.tableInfo.ID, m.tr.tableName, metaStatusInitial.String()) task.End(zap.ErrorLevel, err) return errors.Trace(err) } @@ -2963,8 +2977,8 @@ func (m *tableMetaMgr) AllocTableRowIDs(ctx context.Context, rawRowIDMax int64) return nil, 0, errors.Annotate(err, "enable pessimistic transaction failed") } err = exec.Transact(ctx, "init table allocator base", func(ctx context.Context, tx *sql.Tx) error { - query := "SELECT task_id, row_id_base, row_id_max, total_kvs_base, total_bytes_base, checksum_base, status from ? WHERE table_id = ? FOR UPDATE" - rows, err := tx.QueryContext(ctx, query, m.tableName, m.tr.tableInfo.ID) + query := fmt.Sprintf("SELECT task_id, row_id_base, row_id_max, total_kvs_base, total_bytes_base, checksum_base, status from %s WHERE table_id = ? FOR UPDATE", m.tableName) + rows, err := tx.QueryContext(ctx, query, m.tr.tableInfo.ID) if err != nil { return errors.Trace(err) } @@ -3058,8 +3072,8 @@ func (m *tableMetaMgr) AllocTableRowIDs(ctx context.Context, rawRowIDMax int64) if newRowIDBase == 0 && newStatus < metaStatusRestoreStarted { newStatus = metaStatusRestoreStarted } - query = "update ? set row_id_base = ?, row_id_max = ?, status = ? where table_id = ? and task_id = ?" - _, err := tx.ExecContext(ctx, query, m.tableName, newRowIDBase, newRowIDMax, newStatus.String(), m.tr.tableInfo.ID, m.taskID) + query = fmt.Sprintf("update %s set row_id_base = ?, row_id_max = ?, status = ? where table_id = ? and task_id = ?", m.tableName) + _, err := tx.ExecContext(ctx, query, newRowIDBase, newRowIDMax, newStatus.String(), m.tr.tableInfo.ID, m.taskID) if err != nil { return errors.Trace(err) } @@ -3111,9 +3125,9 @@ func (m *tableMetaMgr) UpdateTableBaseChecksum(ctx context.Context, checksum *ve DB: m.session, Logger: m.tr.logger, } - query := "update ? set total_kvs_base = ?, total_bytes_base = ?, checksum_base = ?, status = ? where table_id = ? and task_id = ?" + query := fmt.Sprintf("update %s set total_kvs_base = ?, total_bytes_base = ?, checksum_base = ?, status = ? where table_id = ? and task_id = ?", m.tableName) - return exec.Exec(ctx, "update base checksum", query, m.tableName, checksum.SumKVS(), + return exec.Exec(ctx, "update base checksum", query, checksum.SumKVS(), checksum.SumSize(), checksum.Sum(), metaStatusRestoreStarted.String(), m.tr.tableInfo.ID, m.taskID) } @@ -3122,8 +3136,8 @@ func (m *tableMetaMgr) updateTableStatus(ctx context.Context, status metaStatus) DB: m.session, Logger: m.tr.logger, } - query := "update ? set status = ? where table_id = ? and task_id = ?" - return exec.Exec(ctx, "update meta status", query, m.tableName, status.String(), m.tr.tableInfo.ID, m.taskID) + query := fmt.Sprintf("update %s set status = ? where table_id = ? 
and task_id = ?", m.tableName) + return exec.Exec(ctx, "update meta status", query, status.String(), m.tr.tableInfo.ID, m.taskID) } func (m *tableMetaMgr) checkAndUpdateLocalChecksum(ctx context.Context, checksum *verify.KVChecksum) (bool, *verify.KVChecksum, error) { @@ -3148,8 +3162,8 @@ func (m *tableMetaMgr) checkAndUpdateLocalChecksum(ctx context.Context, checksum newStatus := metaStatusChecksuming needChecksum := true err = exec.Transact(ctx, "checksum pre-check", func(ctx context.Context, tx *sql.Tx) error { - query := "SELECT task_id, total_kvs_base, total_bytes_base, checksum_base, total_kvs, total_bytes, checksum, status from ? WHERE table_id = ? FOR UPDATE" - rows, err := tx.QueryContext(ctx, query, m.tableName, m.tr.tableInfo.ID) + query := fmt.Sprintf("SELECT task_id, total_kvs_base, total_bytes_base, checksum_base, total_kvs, total_bytes, checksum, status from %s WHERE table_id = ? FOR UPDATE", m.tableName) + rows, err := tx.QueryContext(ctx, query, m.tr.tableInfo.ID) if err != nil { return errors.Annotate(err, "fetch task meta failed") } @@ -3206,8 +3220,8 @@ func (m *tableMetaMgr) checkAndUpdateLocalChecksum(ctx context.Context, checksum rows.Close() closed = true - query = "update ? set total_kvs = ?, total_bytes = ?, checksum = ?, status = ? where table_id = ? and task_id = ?" - _, err = tx.ExecContext(ctx, query, m.tableName, checksum.SumKVS(), checksum.SumSize(), checksum.Sum(), newStatus.String(), m.tr.tableInfo.ID, m.taskID) + query = fmt.Sprintf("update %s set total_kvs = ?, total_bytes = ?, checksum = ?, status = ? where table_id = ? and task_id = ?", m.tableName) + _, err = tx.ExecContext(ctx, query, checksum.SumKVS(), checksum.SumSize(), checksum.Sum(), newStatus.String(), m.tr.tableInfo.ID, m.taskID) return errors.Annotate(err, "update local checksum failed") }) if err != nil { @@ -3227,15 +3241,14 @@ func (m *tableMetaMgr) FinishTable(ctx context.Context) error { DB: m.session, Logger: m.tr.logger, } - query := "DELETE FROM ? where table_id = ? and (status = 'checksuming' or status = 'checksum_skipped')" - return exec.Exec(ctx, "clean up metas", query, m.tableName, m.tr.tableInfo.ID) + query := fmt.Sprintf("DELETE FROM %s where table_id = ? and (status = 'checksuming' or status = 'checksum_skipped')", m.tableName) + return exec.Exec(ctx, "clean up metas", query, m.tr.tableInfo.ID) } type taskMetaMgr struct { - session *sql.DB - taskID int64 - pd *pdutil.PdController - schemaName string + session *sql.DB + taskID int64 + pd *pdutil.PdController // unique name of task meta table tableName string } @@ -3246,8 +3259,8 @@ func (m *taskMetaMgr) InitTaskMeta(ctx context.Context) error { Logger: log.L(), } // avoid override existing metadata if the meta is already inserted. - stmt := `INSERT IGNORE INTO ? (task_id, status) values (?, ?)` - err := exec.Exec(ctx, "init task meta", stmt, m.tableName, m.taskID, metaStatusInitial.String()) + stmt := fmt.Sprintf(`INSERT IGNORE INTO %s (task_id, status) values (?, ?)`, m.tableName) + err := exec.Exec(ctx, "init task meta", stmt, m.taskID, metaStatusInitial.String()) return errors.Trace(err) } @@ -3301,8 +3314,8 @@ func (m *taskMetaMgr) initTask(ctx context.Context) error { Logger: log.L(), } // avoid override existing metadata if the meta is already inserted. - stmt := `INSERT IGNORE INTO ? 
(task_id, status) values (?, ?)` - err := exec.Exec(ctx, "init task meta", stmt, m.tableName, m.taskID, taskMetaStatusInitial.String()) + stmt := fmt.Sprintf(`INSERT IGNORE INTO %s (task_id, status) values (?, ?)`, m.tableName) + err := exec.Exec(ctx, "init task meta", stmt, m.taskID, taskMetaStatusInitial.String()) return errors.Trace(err) } @@ -3325,8 +3338,8 @@ func (m *taskMetaMgr) checkAndPausePdSchedulers(ctx context.Context) (pdutil.Und paused := false var pausedCfg storedCfgs err = exec.Transact(ctx, "check and pause schedulers", func(ctx context.Context, tx *sql.Tx) error { - query := "SELECT task_id, pd_cfgs, status from ? FOR UPDATE" - rows, err := tx.QueryContext(ctx, query, m.tableName) + query := fmt.Sprintf("SELECT task_id, pd_cfgs, status from %s FOR UPDATE", m.tableName) + rows, err := tx.QueryContext(ctx, query) if err != nil { return errors.Annotate(err, "fetch task meta failed") } @@ -3389,8 +3402,8 @@ func (m *taskMetaMgr) checkAndPausePdSchedulers(ctx context.Context) (pdutil.Und return errors.Trace(err) } - query = "update ? set pd_cfgs = ?, status = ? where task_id = ?" - _, err = tx.ExecContext(ctx, query, m.tableName, string(jsonByts), taskMetaStatusScheduleSet.String(), m.taskID) + query = fmt.Sprintf("update %s set pd_cfgs = ?, status = ? where task_id = ?", m.tableName) + _, err = tx.ExecContext(ctx, query, string(jsonByts), taskMetaStatusScheduleSet.String(), m.taskID) return errors.Annotate(err, "update task pd configs failed") }) @@ -3428,8 +3441,8 @@ func (m *taskMetaMgr) CheckAndFinishRestore(ctx context.Context) (bool, error) { switchBack := true err = exec.Transact(ctx, "check and finish schedulers", func(ctx context.Context, tx *sql.Tx) error { - query := "SELECT task_id, status from ? FOR UPDATE" - rows, err := tx.QueryContext(ctx, query, m.tableName) + query := fmt.Sprintf("SELECT task_id, status from %s FOR UPDATE", m.tableName) + rows, err := tx.QueryContext(ctx, query) if err != nil { return errors.Annotate(err, "fetch task meta failed") } @@ -3468,8 +3481,8 @@ func (m *taskMetaMgr) CheckAndFinishRestore(ctx context.Context) (bool, error) { } closed = true - query = "update ? set status = ? where task_id = ?" - _, err = tx.ExecContext(ctx, query, m.tableName, newStatus.String(), m.taskID) + query = fmt.Sprintf("update %s set status = ? where task_id = ?", m.tableName) + _, err = tx.ExecContext(ctx, query, newStatus.String(), m.taskID) return errors.Trace(err) }) @@ -3483,7 +3496,21 @@ func (m *taskMetaMgr) cleanup(ctx context.Context) error { Logger: log.L(), } // avoid override existing metadata if the meta is already inserted. - if err := exec.Exec(ctx, "cleanup task meta tables", "DROP DATABASE ?;", m.schemaName); err != nil { + stmt := fmt.Sprintf("DROP TABLE %s;", m.tableName) + if err := exec.Exec(ctx, "cleanup task meta tables", stmt); err != nil { + return errors.Trace(err) + } + return nil +} + +func cleanupAllMetas(ctx context.Context, dbExecutor common.DBExecutor, schema string) error { + exec := &common.SQLWithRetry{ + DB: dbExecutor, + Logger: log.L(), + } + // avoid override existing metadata if the meta is already inserted. 
+ stmt := fmt.Sprintf("DROP DATABASE %s;", schema) + if err := exec.Exec(ctx, "cleanup task meta tables", stmt); err != nil { return errors.Trace(err) } return nil diff --git a/pkg/lightning/restore/restore_test.go b/pkg/lightning/restore/restore_test.go index d14f4b772..f82106583 100644 --- a/pkg/lightning/restore/restore_test.go +++ b/pkg/lightning/restore/restore_test.go @@ -1492,9 +1492,10 @@ func (s *metaMgrSuite) SetUpTest(c *C) { c.Assert(err, IsNil) s.mgr = &tableMetaMgr{ - session: db, - taskID: 1, - tr: s.tr, + session: db, + taskID: 1, + tr: s.tr, + tableName: common.UniqueTable("test", tableMetaTableName), } s.mockDB = m s.checksumMgr = &testChecksumMgr{} From 3beafa5323e910f68a7878d0683cf555d14229f2 Mon Sep 17 00:00:00 2001 From: glorv Date: Mon, 24 May 2021 13:43:21 +0800 Subject: [PATCH 20/32] fix some bug in task and table meta --- pkg/lightning/restore/restore.go | 33 ++++++++++++++++++++++++++------ 1 file changed, 27 insertions(+), 6 deletions(-) diff --git a/pkg/lightning/restore/restore.go b/pkg/lightning/restore/restore.go index 72c0c8b90..a3638ab69 100644 --- a/pkg/lightning/restore/restore.go +++ b/pkg/lightning/restore/restore.go @@ -1000,6 +1000,7 @@ func (rc *Controller) buildRunPeriodicActionAndCancelFunc(ctx context.Context, s cancelFuncs = append(cancelFuncs, func(bool) { switchModeTicker.Stop() }) cancelFuncs = append(cancelFuncs, func(do bool) { if do { + log.L().Info("switch to normal mode") if err := rc.switchToNormalMode(ctx); err != nil { log.L().Warn("switch tikv to normal mode failed", zap.Error(err)) } @@ -1130,6 +1131,7 @@ func (rc *Controller) buildRunPeriodicActionAndCancelFunc(ctx context.Context, s } } }, func(do bool) { + log.L().Info("cancel periodic actions", zap.Bool("do", do)) for _, f := range cancelFuncs { f(do) } @@ -1486,8 +1488,7 @@ func (tr *TableRestore) restoreTable( } // "show table next_row_id" is only available after v4.0.0 - if tidbVersion.Major >= 4 && rc.cfg.TikvImporter.Backend != config.BackendTiDB && - (common.TableHasAutoRowID(tr.tableInfo.Core) || tr.tableInfo.Core.GetAutoIncrementColInfo() != nil || tr.tableInfo.Core.ContainsAutoRandomBits()) { + if tidbVersion.Major >= 4 && rc.cfg.TikvImporter.Backend != config.BackendTiDB { // first, insert a new-line into meta table if err = metaMgr.InitTableMeta(ctx); err != nil { return false, err @@ -3070,6 +3071,7 @@ func (m *tableMetaMgr) AllocTableRowIDs(ctx context.Context, rawRowIDMax int64) if err != nil { return nil, 0, errors.Annotate(err, "enable pessimistic transaction failed") } + needAutoID := common.TableHasAutoRowID(m.tr.tableInfo.Core) || m.tr.tableInfo.Core.GetAutoIncrementColInfo() != nil || m.tr.tableInfo.Core.ContainsAutoRandomBits() err = exec.Transact(ctx, "init table allocator base", func(ctx context.Context, tx *sql.Tx) error { query := fmt.Sprintf("SELECT task_id, row_id_base, row_id_max, total_kvs_base, total_bytes_base, checksum_base, status from %s WHERE table_id = ? FOR UPDATE", m.tableName) rows, err := tx.QueryContext(ctx, query, m.tr.tableInfo.ID) @@ -3124,7 +3126,7 @@ func (m *tableMetaMgr) AllocTableRowIDs(ctx context.Context, rawRowIDMax int64) // no enough info are available, fetch row_id max for table if curStatus == metaStatusInitial { - if maxRowIDMax == 0 { + if needAutoID && maxRowIDMax == 0 { // NOTE: currently, if a table contains auto_incremental unique key and _tidb_rowid, // the `show table next_row_id` will returns the unique key field only. 
var autoIDField string @@ -3163,7 +3165,7 @@ func (m *tableMetaMgr) AllocTableRowIDs(ctx context.Context, rawRowIDMax int64) newRowIDBase = maxRowIDMax newRowIDMax = newRowIDBase + rawRowIDMax // table contains no data, can skip checksum - if newRowIDBase == 0 && newStatus < metaStatusRestoreStarted { + if needAutoID && newRowIDBase == 0 && newStatus < metaStatusRestoreStarted { newStatus = metaStatusRestoreStarted } query = fmt.Sprintf("update %s set row_id_base = ?, row_id_max = ?, status = ? where table_id = ? and task_id = ?", m.tableName) @@ -3184,7 +3186,7 @@ func (m *tableMetaMgr) AllocTableRowIDs(ctx context.Context, rawRowIDMax int64) // need to do checksum and update checksum meta since we are the first one. if curStatus < metaStatusRestoreStarted { // table contains data but haven't do checksum yet - if newRowIDBase > 0 && baseTotalKvs == 0 { + if (newRowIDBase > 0 || !needAutoID) && baseTotalKvs == 0 { remoteCk, err := DoChecksum(ctx, m.tr.tableInfo) if err != nil { return nil, 0, errors.Trace(err) @@ -3211,6 +3213,11 @@ func (m *tableMetaMgr) AllocTableRowIDs(ctx context.Context, rawRowIDMax int64) ck := verify.MakeKVChecksum(baseTotalBytes, baseTotalKvs, baseChecksum) checksum = &ck } + log.L().Info("allocate table row_id base", zap.String("table", m.tr.tableName), + zap.Int64("row_id_base", newRowIDBase)) + if checksum != nil { + log.L().Info("checksum base", zap.Any("checksum", checksum)) + } return checksum, newRowIDBase, nil } @@ -3286,7 +3293,7 @@ func (m *tableMetaMgr) checkAndUpdateLocalChecksum(ctx context.Context, checksum } if taskID == m.taskID { - if status > metaStatusChecksuming { + if status >= metaStatusChecksuming { newStatus = status needChecksum = status == metaStatusChecksuming return nil @@ -3327,6 +3334,8 @@ func (m *tableMetaMgr) checkAndUpdateLocalChecksum(ctx context.Context, checksum ck := verify.MakeKVChecksum(totalBytes, totalKvs, totalChecksum) remoteChecksum = &ck } + log.L().Info("check table checksum", zap.String("table", m.tr.tableName), + zap.Bool("checksum", needChecksum), zap.String("new_status", newStatus.String())) return needChecksum, remoteChecksum, nil } @@ -3602,6 +3611,18 @@ func cleanupAllMetas(ctx context.Context, dbExecutor common.DBExecutor, schema s DB: dbExecutor, Logger: log.L(), } + + // check if all tables are finished + query := fmt.Sprintf("SELECT COUNT(*) from %s", common.UniqueTable(schema, tableMetaTableName)) + var cnt int + if err := exec.QueryRow(ctx, "fetch table meta row count", query, &cnt); err != nil { + return errors.Trace(err) + } + if cnt > 0 { + log.L().Warn("there are unfinished table in table meta table, cleanup skipped.") + return nil + } + // avoid override existing metadata if the meta is already inserted. 
stmt := fmt.Sprintf("DROP DATABASE %s;", schema) if err := exec.Exec(ctx, "cleanup task meta tables", stmt); err != nil { From 06c8a388f5f0450c3dc12cd0e61a5ae5e43125a6 Mon Sep 17 00:00:00 2001 From: glorv Date: Mon, 24 May 2021 16:05:34 +0800 Subject: [PATCH 21/32] fix tests --- pkg/lightning/restore/restore.go | 177 ++++++++++++++------------ pkg/lightning/restore/restore_test.go | 71 ++++++++++- 2 files changed, 162 insertions(+), 86 deletions(-) diff --git a/pkg/lightning/restore/restore.go b/pkg/lightning/restore/restore.go index a3638ab69..86791d3ba 100644 --- a/pkg/lightning/restore/restore.go +++ b/pkg/lightning/restore/restore.go @@ -202,6 +202,7 @@ type Controller struct { closedEngineLimit *worker.Pool store storage.ExternalStorage + metaMgrBuilder metaMgrBuilder diskQuotaLock sync.RWMutex diskQuotaState int32 @@ -305,6 +306,17 @@ func NewRestoreControllerWithPauser( ts = oracle.ComposeTS(physical, logical) } + // TODO: support Lightning via SQL + db, err := g.GetDB() + if err != nil { + return nil, errors.Trace(err) + } + metaBuilder := &dbMetaMgrBuilder{ + db: db, + taskID: cfg.TaskID, + schema: cfg.App.MetaSchemaName, + } + rc := &Controller{ cfg: cfg, dbMetas: dbMetas, @@ -324,8 +336,9 @@ func NewRestoreControllerWithPauser( saveCpCh: make(chan saveCp), closedEngineLimit: worker.NewPool(ctx, cfg.App.TableConcurrency*2, "closed-engine"), - store: s, - ts: ts, + store: s, + ts: ts, + metaMgrBuilder: metaBuilder, } return rc, nil @@ -1160,24 +1173,14 @@ func (rc *Controller) restoreTables(ctx context.Context) error { return errors.Trace(err) } - db, err := rc.tidbGlue.GetDB() - if err != nil { - return errors.Trace(err) - } - mgr := taskMetaMgr{ - pd: pdController, - taskID: rc.cfg.TaskID, - session: db, - tableName: common.UniqueTable(rc.cfg.App.MetaSchemaName, taskMetaTableName), - } - - if err = mgr.initTask(ctx); err != nil { + mgr := rc.metaMgrBuilder.TaskMetaMgr(pdController) + if err = mgr.InitTask(ctx); err != nil { return err } logTask.Info("removing PD leader®ion schedulers") - restoreFn, err := mgr.checkAndPausePdSchedulers(ctx) + restoreFn, err := mgr.CheckAndPausePdSchedulers(ctx) finishSchedulers = func() { if restoreFn != nil { // use context.Background to make sure this restore function can still be executed even if ctx is canceled @@ -1193,12 +1196,12 @@ func (rc *Controller) restoreTables(ctx context.Context) error { logTask.Warn("failed to restore removed schedulers, you may need to restore them manually", zap.Error(restoreE)) } // clean up task metas - if cleanupErr := mgr.cleanup(restoreCtx); cleanupErr != nil { + if cleanupErr := mgr.Cleanup(restoreCtx); cleanupErr != nil { logTask.Warn("failed to clean task metas, you may need to restore them manually", zap.Error(cleanupErr)) } // cleanup table meta and schema db if needed. 
cleanupFunc = func() { - if e := cleanupAllMetas(restoreCtx, db, rc.cfg.App.MetaSchemaName); err != nil { + if e := mgr.CleanupAllMetas(restoreCtx); err != nil { logTask.Warn("failed to clean table task metas, you may need to restore them manually", zap.Error(e)) } } @@ -1262,7 +1265,9 @@ func (rc *Controller) restoreTables(ctx context.Context) error { for task := range taskCh { tableLogTask := task.tr.logger.Begin(zap.InfoLevel, "restore table") web.BroadcastTableCheckpoint(task.tr.tableName, task.cp) + fmt.Printf("before restore table\n") needPostProcess, err := task.tr.restoreTable(ctx2, rc, task.cp) + fmt.Printf("after restore table\n") err = errors.Annotatef(err, "restore table %s failed", task.tr.tableName) tableLogTask.End(zap.ErrorLevel, err) web.BroadcastError(task.tr.tableName, err) @@ -1400,24 +1405,13 @@ func (rc *Controller) restoreTables(ctx context.Context) error { finishFuncCalled = true close(postProcessTaskChan) - // TODO: support Lightning via SQL - db, err := rc.tidbGlue.GetDB() - if err != nil { - return errors.Trace(err) - } - // otherwise, we should run all tasks in the post-process task chan for i := 0; i < rc.cfg.App.TableConcurrency; i++ { wg.Add(1) go func() { defer wg.Done() for task := range postProcessTaskChan { - metaMgr := &tableMetaMgr{ - session: db, - taskID: rc.cfg.TaskID, - tr: task.tr, - tableName: common.UniqueTable(rc.cfg.App.MetaSchemaName, tableMetaTableName), - } + metaMgr := rc.metaMgrBuilder.TableMetaMgr(task.tr) // force all the remain post-process tasks to be executed _, err = task.tr.postProcess(ctx2, rc, task.cp, true, metaMgr) restoreErr.Set(err) @@ -1444,19 +1438,7 @@ func (tr *TableRestore) restoreTable( default: } - // TODO: support Lightning via SQL - db, err := rc.tidbGlue.GetDB() - if err != nil { - return false, errors.Trace(err) - } - - metaMgr := &tableMetaMgr{ - session: db, - taskID: rc.cfg.TaskID, - tr: tr, - tableName: common.UniqueTable(rc.cfg.App.MetaSchemaName, tableMetaTableName), - } - + metaMgr := rc.metaMgrBuilder.TableMetaMgr(tr) // no need to do anything if the chunks are already populated if len(cp.Engines) > 0 { tr.logger.Info("reusing engines and files info from checkpoint", @@ -1488,7 +1470,7 @@ func (tr *TableRestore) restoreTable( } // "show table next_row_id" is only available after v4.0.0 - if tidbVersion.Major >= 4 && rc.cfg.TikvImporter.Backend != config.BackendTiDB { + if tidbVersion.Major >= 4 && rc.cfg.TikvImporter.Backend == config.BackendLocal { // first, insert a new-line into meta table if err = metaMgr.InitTableMeta(ctx); err != nil { return false, err @@ -1539,12 +1521,12 @@ func (tr *TableRestore) restoreTable( } // 2. 
Restore engines (if still needed) - err = tr.restoreEngines(ctx, rc, cp) + err := tr.restoreEngines(ctx, rc, cp) if err != nil { return false, errors.Trace(err) } - err = metaMgr.updateTableStatus(ctx, metaStatusRestoreFinished) + err = metaMgr.UpdateTableStatus(ctx, metaStatusRestoreFinished) if err != nil { return false, errors.Trace(err) } @@ -1977,7 +1959,7 @@ func (tr *TableRestore) postProcess( rc *Controller, cp *checkpoints.TableCheckpoint, forcePostProcess bool, - metaMgr *tableMetaMgr, + metaMgr tableMetaMgr, ) (bool, error) { // there are no data in this table, no need to do post process // this is important for tables that are just the dump table of views @@ -2032,7 +2014,7 @@ func (tr *TableRestore) postProcess( if forcePostProcess || !rc.cfg.PostRestore.PostProcessAtLast { tr.logger.Info("local checksum", zap.Object("checksum", &localChecksum)) - needChecksum, baseTotalChecksum, err := metaMgr.checkAndUpdateLocalChecksum(ctx, &localChecksum) + needChecksum, baseTotalChecksum, err := metaMgr.CheckAndUpdateLocalChecksum(ctx, &localChecksum) if err != nil { return false, err } @@ -2979,14 +2961,53 @@ func (cr *chunkRestore) restore( } } -type tableMetaMgr struct { +type metaMgrBuilder interface { + TaskMetaMgr(pd *pdutil.PdController) taskMetaMgr + TableMetaMgr(tr *TableRestore) tableMetaMgr +} + +type dbMetaMgrBuilder struct { + db *sql.DB + taskID int64 + schema string +} + +func (b *dbMetaMgrBuilder) TaskMetaMgr(pd *pdutil.PdController) taskMetaMgr { + return &dbTaskMetaMgr{ + session: b.db, + taskID: b.taskID, + pd: pd, + tableName: common.UniqueTable(b.schema, taskMetaTableName), + schemaName: b.schema, + } +} + +func (b *dbMetaMgrBuilder) TableMetaMgr(tr *TableRestore) tableMetaMgr { + return &dbTableMetaMgr{ + session: b.db, + taskID: b.taskID, + tr: tr, + tableName: common.UniqueTable(b.schema, tableMetaTableName), + } +} + +type tableMetaMgr interface { + InitTableMeta(ctx context.Context) error + AllocTableRowIDs(ctx context.Context, rawRowIDMax int64) (*verify.KVChecksum, int64, error) + UpdateTableStatus(ctx context.Context, status metaStatus) error + UpdateTableBaseChecksum(ctx context.Context, checksum *verify.KVChecksum) error + CheckAndUpdateLocalChecksum(ctx context.Context, checksum *verify.KVChecksum) (bool, *verify.KVChecksum, error) + FinishTable(ctx context.Context) error +} + +type dbTableMetaMgr struct { session *sql.DB taskID int64 tr *TableRestore tableName string } -func (m *tableMetaMgr) InitTableMeta(ctx context.Context) error { +func (m *dbTableMetaMgr) InitTableMeta(ctx context.Context) error { exec := &common.SQLWithRetry{ DB: m.session, Logger: m.tr.logger, @@ -3053,7 +3074,7 @@ func parseMetaStatus(s string) (metaStatus, error) { } } -func (m *tableMetaMgr) AllocTableRowIDs(ctx context.Context, rawRowIDMax int64) (*verify.KVChecksum, int64, error) { +func (m *dbTableMetaMgr) AllocTableRowIDs(ctx context.Context, rawRowIDMax int64) (*verify.KVChecksum, int64, error) { conn, err := m.session.Conn(ctx) if err != nil { return nil, 0, errors.Trace(err) @@ -3205,7 +3226,7 @@ func (m *tableMetaMgr) AllocTableRowIDs(ctx context.Context, rawRowIDMax int64) } m.tr.logger.Info("checksum before restore table", zap.Object("checksum", checksum)) - } else if err = m.updateTableStatus(ctx, metaStatusRestoreStarted); err != nil { + } else if err = m.UpdateTableStatus(ctx, metaStatusRestoreStarted); err != nil { return nil, 0, errors.Trace(err) } } @@ -3221,7 +3242,7 @@ func (m *tableMetaMgr) AllocTableRowIDs(ctx context.Context, rawRowIDMax int64) return 
checksum, newRowIDBase, nil } -func (m *tableMetaMgr) UpdateTableBaseChecksum(ctx context.Context, checksum *verify.KVChecksum) error { +func (m *dbTableMetaMgr) UpdateTableBaseChecksum(ctx context.Context, checksum *verify.KVChecksum) error { exec := &common.SQLWithRetry{ DB: m.session, Logger: m.tr.logger, @@ -3232,7 +3253,7 @@ func (m *tableMetaMgr) UpdateTableBaseChecksum(ctx context.Context, checksum *ve checksum.SumSize(), checksum.Sum(), metaStatusRestoreStarted.String(), m.tr.tableInfo.ID, m.taskID) } -func (m *tableMetaMgr) updateTableStatus(ctx context.Context, status metaStatus) error { +func (m *dbTableMetaMgr) UpdateTableStatus(ctx context.Context, status metaStatus) error { exec := &common.SQLWithRetry{ DB: m.session, Logger: m.tr.logger, @@ -3241,7 +3262,7 @@ func (m *tableMetaMgr) updateTableStatus(ctx context.Context, status metaStatus) return exec.Exec(ctx, "update meta status", query, status.String(), m.tr.tableInfo.ID, m.taskID) } -func (m *tableMetaMgr) checkAndUpdateLocalChecksum(ctx context.Context, checksum *verify.KVChecksum) (bool, *verify.KVChecksum, error) { +func (m *dbTableMetaMgr) CheckAndUpdateLocalChecksum(ctx context.Context, checksum *verify.KVChecksum) (bool, *verify.KVChecksum, error) { conn, err := m.session.Conn(ctx) if err != nil { return false, nil, errors.Trace(err) @@ -3339,7 +3360,7 @@ func (m *tableMetaMgr) checkAndUpdateLocalChecksum(ctx context.Context, checksum return needChecksum, remoteChecksum, nil } -func (m *tableMetaMgr) FinishTable(ctx context.Context) error { +func (m *dbTableMetaMgr) FinishTable(ctx context.Context) error { exec := &common.SQLWithRetry{ DB: m.session, Logger: m.tr.logger, @@ -3348,23 +3369,21 @@ func (m *tableMetaMgr) FinishTable(ctx context.Context) error { return exec.Exec(ctx, "clean up metas", query, m.tr.tableInfo.ID) } -type taskMetaMgr struct { +type taskMetaMgr interface { + InitTask(ctx context.Context) error + CheckAndPausePdSchedulers(ctx context.Context) (pdutil.UndoFunc, error) + CheckAndFinishRestore(ctx context.Context) (bool, error) + Cleanup(ctx context.Context) error + CleanupAllMetas(ctx context.Context) error +} + +type dbTaskMetaMgr struct { session *sql.DB taskID int64 pd *pdutil.PdController // unique name of task meta table - tableName string -} - -func (m *taskMetaMgr) InitTaskMeta(ctx context.Context) error { - exec := &common.SQLWithRetry{ - DB: m.session, - Logger: log.L(), - } - // avoid override existing metadata if the meta is already inserted. 
- stmt := fmt.Sprintf(`INSERT IGNORE INTO %s (task_id, status) values (?, ?)`, m.tableName) - err := exec.Exec(ctx, "init task meta", stmt, m.taskID, metaStatusInitial.String()) - return errors.Trace(err) + tableName string + schemaName string } type taskMetaStatus uint32 @@ -3408,10 +3427,10 @@ func parseTaskMetaStatus(s string) (taskMetaStatus, error) { type storedCfgs struct { PauseCfg pdutil.ClusterConfig `json:"paused"` - RestoreCFg pdutil.ClusterConfig `json:"restore"` + RestoreCfg pdutil.ClusterConfig `json:"restore"` } -func (m *taskMetaMgr) initTask(ctx context.Context) error { +func (m *dbTaskMetaMgr) InitTask(ctx context.Context) error { exec := &common.SQLWithRetry{ DB: m.session, Logger: log.L(), @@ -3422,7 +3441,7 @@ func (m *taskMetaMgr) initTask(ctx context.Context) error { return errors.Trace(err) } -func (m *taskMetaMgr) checkAndPausePdSchedulers(ctx context.Context) (pdutil.UndoFunc, error) { +func (m *dbTaskMetaMgr) CheckAndPausePdSchedulers(ctx context.Context) (pdutil.UndoFunc, error) { conn, err := m.session.Conn(ctx) if err != nil { return nil, errors.Trace(err) @@ -3499,7 +3518,7 @@ func (m *taskMetaMgr) checkAndPausePdSchedulers(ctx context.Context) (pdutil.Und } paused = true - pausedCfg = storedCfgs{PauseCfg: removed, RestoreCFg: orig} + pausedCfg = storedCfgs{PauseCfg: removed, RestoreCfg: orig} jsonByts, err := json.Marshal(&pausedCfg) if err != nil { return errors.Trace(err) @@ -3524,10 +3543,10 @@ func (m *taskMetaMgr) checkAndPausePdSchedulers(ctx context.Context) (pdutil.Und } } - return m.pd.MakeUndoFunctionByConfig(pausedCfg.RestoreCFg), nil + return m.pd.MakeUndoFunctionByConfig(pausedCfg.RestoreCfg), nil } -func (m *taskMetaMgr) CheckAndFinishRestore(ctx context.Context) (bool, error) { +func (m *dbTaskMetaMgr) CheckAndFinishRestore(ctx context.Context) (bool, error) { conn, err := m.session.Conn(ctx) if err != nil { return false, errors.Trace(err) @@ -3593,7 +3612,7 @@ func (m *taskMetaMgr) CheckAndFinishRestore(ctx context.Context) (bool, error) { return switchBack, err } -func (m *taskMetaMgr) cleanup(ctx context.Context) error { +func (m *dbTaskMetaMgr) Cleanup(ctx context.Context) error { exec := &common.SQLWithRetry{ DB: m.session, Logger: log.L(), @@ -3606,14 +3625,14 @@ func (m *taskMetaMgr) cleanup(ctx context.Context) error { return nil } -func cleanupAllMetas(ctx context.Context, dbExecutor common.DBExecutor, schema string) error { +func (m *dbTaskMetaMgr) CleanupAllMetas(ctx context.Context) error { exec := &common.SQLWithRetry{ - DB: dbExecutor, + DB: m.session, Logger: log.L(), } // check if all tables are finished - query := fmt.Sprintf("SELECT COUNT(*) from %s", common.UniqueTable(schema, tableMetaTableName)) + query := fmt.Sprintf("SELECT COUNT(*) from %s", common.UniqueTable(m.schemaName, tableMetaTableName)) var cnt int if err := exec.QueryRow(ctx, "fetch table meta row count", query, &cnt); err != nil { return errors.Trace(err) @@ -3624,7 +3643,7 @@ func cleanupAllMetas(ctx context.Context, dbExecutor common.DBExecutor, schema s } // avoid override existing metadata if the meta is already inserted. 
- stmt := fmt.Sprintf("DROP DATABASE %s;", schema) + stmt := fmt.Sprintf("DROP DATABASE %s;", m.schemaName) if err := exec.Exec(ctx, "cleanup task meta tables", stmt); err != nil { return errors.Trace(err) } diff --git a/pkg/lightning/restore/restore_test.go b/pkg/lightning/restore/restore_test.go index c2cf9e26e..13dc81431 100644 --- a/pkg/lightning/restore/restore_test.go +++ b/pkg/lightning/restore/restore_test.go @@ -54,6 +54,7 @@ import ( "github.com/pingcap/br/pkg/lightning/web" "github.com/pingcap/br/pkg/lightning/worker" "github.com/pingcap/br/pkg/mock" + "github.com/pingcap/br/pkg/pdutil" "github.com/pingcap/br/pkg/storage" "github.com/pingcap/br/pkg/version/build" ) @@ -866,6 +867,7 @@ func (s *tableRestoreSuite) TestTableRestoreMetrics(c *C) { c.Assert(err, IsNil) cpDB := checkpoints.NewNullCheckpointsDB() + g := mock.NewMockGlue(controller) rc := &Controller{ cfg: cfg, dbMetas: []*mydump.MDDatabaseMeta{ @@ -885,17 +887,22 @@ func (s *tableRestoreSuite) TestTableRestoreMetrics(c *C) { saveCpCh: chptCh, pauser: DeliverPauser, backend: noop.NewNoopBackend(), - tidbGlue: mock.NewMockGlue(controller), + tidbGlue: g, errorSummaries: makeErrorSummaries(log.L()), tls: tls, checkpointsDB: cpDB, closedEngineLimit: worker.NewPool(ctx, 1, "closed_engine"), store: s.store, + metaMgrBuilder: testMetaMgrBuilder{}, } go func() { for range chptCh { } }() + exec := mock.NewMockSQLExecutor(controller) + g.EXPECT().GetSQLExecutor().Return(exec).AnyTimes() + exec.EXPECT().ObtainStringWithLog(gomock.Any(), "SELECT version()", gomock.Any(), gomock.Any()). + Return("5.7.25-TiDB-v5.0.1", nil).AnyTimes() web.BroadcastInitProgress(rc.dbMetas) @@ -914,6 +921,56 @@ func (s *tableRestoreSuite) TestTableRestoreMetrics(c *C) { c.Assert(tableFinished-tableFinishedBase, Equals, float64(1)) } +type testMetaMgrBuilder struct{} + +func (b testMetaMgrBuilder) TaskMetaMgr(pd *pdutil.PdController) taskMetaMgr { + return testTaskMetaMgr{} +} +func (b testMetaMgrBuilder) TableMetaMgr(tr *TableRestore) tableMetaMgr { + return testTableMetaMgr{} +} + +type testTaskMetaMgr struct{} + +func (m testTaskMetaMgr) InitTask(ctx context.Context) error { + return nil +} +func (m testTaskMetaMgr) CheckAndPausePdSchedulers(ctx context.Context) (pdutil.UndoFunc, error) { + return func(ctx context.Context) error { + return nil + }, nil +} +func (m testTaskMetaMgr) CheckAndFinishRestore(ctx context.Context) (bool, error) { + return false, nil +} +func (m testTaskMetaMgr) Cleanup(ctx context.Context) error { + return nil +} +func (m testTaskMetaMgr) CleanupAllMetas(ctx context.Context) error { + return nil +} + +type testTableMetaMgr struct{} + +func (m testTableMetaMgr) InitTableMeta(ctx context.Context) error { + return nil +} +func (m testTableMetaMgr) AllocTableRowIDs(ctx context.Context, rawRowIDMax int64) (*verification.KVChecksum, int64, error) { + return nil, 0, nil +} +func (m testTableMetaMgr) UpdateTableStatus(ctx context.Context, status metaStatus) error { + return nil +} +func (m testTableMetaMgr) UpdateTableBaseChecksum(ctx context.Context, checksum *verification.KVChecksum) error { + return nil +} +func (m testTableMetaMgr) CheckAndUpdateLocalChecksum(ctx context.Context, checksum *verification.KVChecksum) (bool, *verification.KVChecksum, error) { + return false, nil, nil +} +func (m testTableMetaMgr) FinishTable(ctx context.Context) error { + return nil +} + var _ = Suite(&chunkRestoreSuite{}) type chunkRestoreSuite struct { @@ -1506,7 +1563,7 @@ type metaMgrSuite struct { dbHandle *sql.DB mockDB sqlmock.Sqlmock tr 
*TableRestore - mgr *tableMetaMgr + mgr *dbTableMetaMgr checksumMgr *testChecksumMgr } @@ -1542,7 +1599,7 @@ func (s *metaMgrSuite) SetUpTest(c *C) { db, m, err := sqlmock.New() c.Assert(err, IsNil) - s.mgr = &tableMetaMgr{ + s.mgr = &dbTableMetaMgr{ session: db, taskID: 1, tr: s.tr, @@ -1686,7 +1743,7 @@ func (s *metaMgrSuite) prepareMock(rowsVal [][]driver.Value, nextRowID *int64, u for _, r := range rowsVal { rows = rows.AddRow(r...) } - s.mockDB.ExpectQuery("\\QSELECT task_id, row_id_base, row_id_max, total_kvs_base, total_bytes_base, checksum_base, status from mysql.brie_sub_tasks WHERE table_id = ? FOR UPDATE\\E"). + s.mockDB.ExpectQuery("\\QSELECT task_id, row_id_base, row_id_max, total_kvs_base, total_bytes_base, checksum_base, status from `test`.`table_meta` WHERE table_id = ? FOR UPDATE\\E"). WithArgs(int64(1)). WillReturnRows(rows) if nextRowID != nil { @@ -1696,7 +1753,7 @@ func (s *metaMgrSuite) prepareMock(rowsVal [][]driver.Value, nextRowID *int64, u } if len(updateArgs) > 0 { - s.mockDB.ExpectExec("\\Qupdate mysql.brie_sub_tasks set row_id_base = ?, row_id_max = ?, status = ? where table_id = ? and task_id = ?\\E"). + s.mockDB.ExpectExec("\\Qupdate `test`.`table_meta` set row_id_base = ?, row_id_max = ?, status = ? where table_id = ? and task_id = ?\\E"). WithArgs(updateArgs...). WillReturnResult(sqlmock.NewResult(int64(0), int64(1))) } @@ -1704,7 +1761,7 @@ func (s *metaMgrSuite) prepareMock(rowsVal [][]driver.Value, nextRowID *int64, u s.mockDB.ExpectCommit() if checksum != nil { - s.mockDB.ExpectExec("\\Qupdate mysql.brie_sub_tasks set total_kvs_base = ?, total_bytes_base = ?, checksum_base = ?, status = ? where table_id = ? and task_id = ?\\E"). + s.mockDB.ExpectExec("\\Qupdate `test`.`table_meta` set total_kvs_base = ?, total_bytes_base = ?, checksum_base = ?, status = ? where table_id = ? and task_id = ?\\E"). WithArgs(checksum.SumKVS(), checksum.SumSize(), checksum.Sum(), metaStatusRestoreStarted.String(), int64(1), int64(1)). WillReturnResult(sqlmock.NewResult(int64(0), int64(1))) s.checksumMgr.checksum = RemoteChecksum{ @@ -1715,7 +1772,7 @@ func (s *metaMgrSuite) prepareMock(rowsVal [][]driver.Value, nextRowID *int64, u } if updateStatus != nil { - s.mockDB.ExpectExec("\\Qupdate mysql.brie_sub_tasks set status = ? where table_id = ? and task_id = ?\\E"). + s.mockDB.ExpectExec("\\Qupdate `test`.`table_meta` set status = ? where table_id = ? and task_id = ?\\E"). WithArgs(*updateStatus, int64(1), int64(1)). WillReturnResult(sqlmock.NewResult(int64(0), int64(1))) } From e25fa789f071bb3c94783653232aacf20be974df Mon Sep 17 00:00:00 2001 From: glorv Date: Mon, 24 May 2021 17:46:28 +0800 Subject: [PATCH 22/32] fix build --- pkg/lightning/restore/restore.go | 22 +++++++++++++++------- pkg/lightning/restore/restore_test.go | 10 ++++++++++ 2 files changed, 25 insertions(+), 7 deletions(-) diff --git a/pkg/lightning/restore/restore.go b/pkg/lightning/restore/restore.go index 86791d3ba..469bd16f4 100644 --- a/pkg/lightning/restore/restore.go +++ b/pkg/lightning/restore/restore.go @@ -993,18 +993,22 @@ func (rc *Controller) listenCheckpointUpdates() { // buildRunPeriodicActionAndCancelFunc build the runPeriodicAction func and a cancel func func (rc *Controller) buildRunPeriodicActionAndCancelFunc(ctx context.Context, stop <-chan struct{}) (func(), func(bool)) { cancelFuncs := make([]func(bool), 0) - + closeFuncs := make([]func(), 0) // a nil channel blocks forever. // if the cron duration is zero we use the nil channel to skip the action. 
var logProgressChan <-chan time.Time if rc.cfg.Cron.LogProgress.Duration > 0 { logProgressTicker := time.NewTicker(rc.cfg.Cron.LogProgress.Duration) - defer logProgressTicker.Stop() + closeFuncs = append(closeFuncs, func() { + logProgressTicker.Stop() + }) logProgressChan = logProgressTicker.C } glueProgressTicker := time.NewTicker(3 * time.Second) - defer glueProgressTicker.Stop() + closeFuncs = append(closeFuncs, func() { + glueProgressTicker.Stop() + }) var switchModeChan <-chan time.Time // tidb backend don't need to switch tikv to import mode @@ -1018,7 +1022,6 @@ func (rc *Controller) buildRunPeriodicActionAndCancelFunc(ctx context.Context, s log.L().Warn("switch tikv to normal mode failed", zap.Error(err)) } } - }) switchModeChan = switchModeTicker.C } @@ -1032,6 +1035,11 @@ func (rc *Controller) buildRunPeriodicActionAndCancelFunc(ctx context.Context, s } return func() { + defer func() { + for _, f := range closeFuncs { + f() + } + }() // tidb backend don't need to switch tikv to import mode if rc.cfg.TikvImporter.Backend != config.BackendTiDB && rc.cfg.Cron.SwitchMode.Duration > 0 { rc.switchToImportMode(ctx) @@ -1400,9 +1408,9 @@ func (rc *Controller) restoreTables(ctx context.Context) error { // stop periodic tasks for restore table such as pd schedulers and switch-mode tasks. // this can help make cluster switching back to normal state more quickly. - finishSchedulers() - cancelFunc(switchBack) - finishFuncCalled = true + // finishSchedulers() + // cancelFunc(switchBack) + // finishFuncCalled = true close(postProcessTaskChan) // otherwise, we should run all tasks in the post-process task chan diff --git a/pkg/lightning/restore/restore_test.go b/pkg/lightning/restore/restore_test.go index 13dc81431..48a2ec650 100644 --- a/pkg/lightning/restore/restore_test.go +++ b/pkg/lightning/restore/restore_test.go @@ -926,6 +926,7 @@ type testMetaMgrBuilder struct{} func (b testMetaMgrBuilder) TaskMetaMgr(pd *pdutil.PdController) taskMetaMgr { return testTaskMetaMgr{} } + func (b testMetaMgrBuilder) TableMetaMgr(tr *TableRestore) tableMetaMgr { return testTableMetaMgr{} } @@ -935,17 +936,21 @@ type testTaskMetaMgr struct{} func (m testTaskMetaMgr) InitTask(ctx context.Context) error { return nil } + func (m testTaskMetaMgr) CheckAndPausePdSchedulers(ctx context.Context) (pdutil.UndoFunc, error) { return func(ctx context.Context) error { return nil }, nil } + func (m testTaskMetaMgr) CheckAndFinishRestore(ctx context.Context) (bool, error) { return false, nil } + func (m testTaskMetaMgr) Cleanup(ctx context.Context) error { return nil } + func (m testTaskMetaMgr) CleanupAllMetas(ctx context.Context) error { return nil } @@ -955,18 +960,23 @@ type testTableMetaMgr struct{} func (m testTableMetaMgr) InitTableMeta(ctx context.Context) error { return nil } + func (m testTableMetaMgr) AllocTableRowIDs(ctx context.Context, rawRowIDMax int64) (*verification.KVChecksum, int64, error) { return nil, 0, nil } + func (m testTableMetaMgr) UpdateTableStatus(ctx context.Context, status metaStatus) error { return nil } + func (m testTableMetaMgr) UpdateTableBaseChecksum(ctx context.Context, checksum *verification.KVChecksum) error { return nil } + func (m testTableMetaMgr) CheckAndUpdateLocalChecksum(ctx context.Context, checksum *verification.KVChecksum) (bool, *verification.KVChecksum, error) { return false, nil, nil } + func (m testTableMetaMgr) FinishTable(ctx context.Context) error { return nil } From 6a9ee4fd0026ad6d2d04c6b1c01b3fb3e63844f5 Mon Sep 17 00:00:00 2001 From: glorv Date: Tue, 25 May 2021 
10:08:31 +0800 Subject: [PATCH 23/32] add log --- pkg/lightning/common/util.go | 1 + pkg/lightning/restore/restore.go | 2 -- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/pkg/lightning/common/util.go b/pkg/lightning/common/util.go index f3b217fd1..c04985d5a 100644 --- a/pkg/lightning/common/util.go +++ b/pkg/lightning/common/util.go @@ -142,6 +142,7 @@ outside: logger.Warn(purpose+" failed but going to try again", log.ShortError(err)) continue default: + logger.Warn(purpose+" failed with no retry", log.ShortError(err)) break outside } } diff --git a/pkg/lightning/restore/restore.go b/pkg/lightning/restore/restore.go index 469bd16f4..860dfc3f2 100644 --- a/pkg/lightning/restore/restore.go +++ b/pkg/lightning/restore/restore.go @@ -1273,9 +1273,7 @@ func (rc *Controller) restoreTables(ctx context.Context) error { for task := range taskCh { tableLogTask := task.tr.logger.Begin(zap.InfoLevel, "restore table") web.BroadcastTableCheckpoint(task.tr.tableName, task.cp) - fmt.Printf("before restore table\n") needPostProcess, err := task.tr.restoreTable(ctx2, rc, task.cp) - fmt.Printf("after restore table\n") err = errors.Annotatef(err, "restore table %s failed", task.tr.tableName) tableLogTask.End(zap.ErrorLevel, err) web.BroadcastError(task.tr.tableName, err) From 3df592b1715e18b08e0cb0b985d867eb8abe38db Mon Sep 17 00:00:00 2001 From: glorv Date: Tue, 25 May 2021 10:42:02 +0800 Subject: [PATCH 24/32] fix meta mgr for tidb backend --- pkg/lightning/restore/restore.go | 134 ++++++++++++++++++++------ pkg/lightning/restore/restore_test.go | 63 +----------- 2 files changed, 107 insertions(+), 90 deletions(-) diff --git a/pkg/lightning/restore/restore.go b/pkg/lightning/restore/restore.go index 860dfc3f2..03dc9a129 100644 --- a/pkg/lightning/restore/restore.go +++ b/pkg/lightning/restore/restore.go @@ -59,6 +59,7 @@ import ( "github.com/pingcap/br/pkg/lightning/web" "github.com/pingcap/br/pkg/lightning/worker" "github.com/pingcap/br/pkg/pdutil" + "github.com/pingcap/br/pkg/redact" "github.com/pingcap/br/pkg/storage" "github.com/pingcap/br/pkg/utils" "github.com/pingcap/br/pkg/version" @@ -306,15 +307,21 @@ func NewRestoreControllerWithPauser( ts = oracle.ComposeTS(physical, logical) } - // TODO: support Lightning via SQL - db, err := g.GetDB() - if err != nil { - return nil, errors.Trace(err) - } - metaBuilder := &dbMetaMgrBuilder{ - db: db, - taskID: cfg.TaskID, - schema: cfg.App.MetaSchemaName, + var metaBuilder metaMgrBuilder + switch cfg.TikvImporter.Backend { + case config.BackendLocal, config.BackendImporter: + // TODO: support Lightning via SQL + db, err := g.GetDB() + if err != nil { + return nil, errors.Trace(err) + } + metaBuilder = &dbMetaMgrBuilder{ + db: db, + taskID: cfg.TaskID, + schema: cfg.App.MetaSchemaName, + } + default: + metaBuilder = noopMetaMgrBuilder{} } rc := &Controller{ @@ -724,25 +731,6 @@ func (rc *Controller) restoreSchema(ctx context.Context) error { rc.sysVars = ObtainImportantVariables(ctx, rc.tidbGlue.GetSQLExecutor()) - // TODO: maybe we should not create this table here since user may not have write permission to the `mysql` db. 
- // ensure meta table exists - if rc.cfg.TikvImporter.Backend != config.BackendTiDB { - exec := rc.tidbGlue.GetSQLExecutor() - logger := log.L() - metaDBSQL := fmt.Sprintf("CREATE DATABASE IF NOT EXISTS `%s`", rc.cfg.App.MetaSchemaName) - if err := exec.ExecuteWithLog(ctx, metaDBSQL, "create meta schema", logger); err != nil { - return errors.Annotate(err, "create meta schema failed") - } - taskMetaSQL := fmt.Sprintf(CreateTaskMetaTable, rc.cfg.App.MetaSchemaName, taskMetaTableName) - if err := exec.ExecuteWithLog(ctx, taskMetaSQL, "create meta table", log.L()); err != nil { - return errors.Annotate(err, "create task meta table failed") - } - tableMetaSQL := fmt.Sprintf(CreateTableMetadataTable, rc.cfg.App.MetaSchemaName, tableMetaTableName) - if err := exec.ExecuteWithLog(ctx, tableMetaSQL, "create meta table", log.L()); err != nil { - return errors.Annotate(err, "create table meta table failed") - } - } - // Estimate the number of chunks for progress reporting err = rc.estimateChunkCountIntoMetrics(ctx) return err @@ -1164,6 +1152,10 @@ var checksumManagerKey struct{} func (rc *Controller) restoreTables(ctx context.Context) error { logTask := log.L().Begin(zap.InfoLevel, "restore all tables data") + if err := rc.metaMgrBuilder.Init(ctx); err != nil { + return err + } + // for local backend, we should disable some pd scheduler and change some settings, to // make split region and ingest sst more stable // because importer backend is mostly use for v3.x cluster which doesn't support these api, @@ -2968,6 +2960,7 @@ func (cr *chunkRestore) restore( } type metaMgrBuilder interface { + Init(ctx context.Context) error TaskMetaMgr(pd *pdutil.PdController) taskMetaMgr TableMetaMgr(tr *TableRestore) tableMetaMgr } @@ -2978,6 +2971,27 @@ type dbMetaMgrBuilder struct { schema string } +func (b *dbMetaMgrBuilder) Init(ctx context.Context) error { + exec := common.SQLWithRetry{ + DB: b.db, + Logger: log.L(), + HideQueryLog: redact.NeedRedact(), + } + metaDBSQL := fmt.Sprintf("CREATE DATABASE IF NOT EXISTS `%s`", b.schema) + if err := exec.Exec(ctx, "create meta schema", metaDBSQL); err != nil { + return errors.Annotate(err, "create meta schema failed") + } + taskMetaSQL := fmt.Sprintf(CreateTaskMetaTable, b.schema, taskMetaTableName) + if err := exec.Exec(ctx, "create meta table", taskMetaSQL); err != nil { + return errors.Annotate(err, "create task meta table failed") + } + tableMetaSQL := fmt.Sprintf(CreateTableMetadataTable, b.schema, tableMetaTableName) + if err := exec.Exec(ctx, "create meta table", tableMetaSQL); err != nil { + return errors.Annotate(err, "create table meta table failed") + } + return nil +} + func (b *dbMetaMgrBuilder) TaskMetaMgr(pd *pdutil.PdController) taskMetaMgr { return &dbTaskMetaMgr{ session: b.db, @@ -3655,3 +3669,67 @@ func (m *dbTaskMetaMgr) CleanupAllMetas(ctx context.Context) error { } return nil } + +type noopMetaMgrBuilder struct{} + +func (b noopMetaMgrBuilder) Init(ctx context.Context) error { + return nil +} + +func (b noopMetaMgrBuilder) TaskMetaMgr(pd *pdutil.PdController) taskMetaMgr { + return noopTaskMetaMgr{} +} + +func (b noopMetaMgrBuilder) TableMetaMgr(tr *TableRestore) tableMetaMgr { + return noopTableMetaMgr{} +} + +type noopTaskMetaMgr struct{} + +func (m noopTaskMetaMgr) InitTask(ctx context.Context) error { + return nil +} + +func (m noopTaskMetaMgr) CheckAndPausePdSchedulers(ctx context.Context) (pdutil.UndoFunc, error) { + return func(ctx context.Context) error { + return nil + }, nil +} + +func (m noopTaskMetaMgr) CheckAndFinishRestore(ctx 
context.Context) (bool, error) { + return false, nil +} + +func (m noopTaskMetaMgr) Cleanup(ctx context.Context) error { + return nil +} + +func (m noopTaskMetaMgr) CleanupAllMetas(ctx context.Context) error { + return nil +} + +type noopTableMetaMgr struct{} + +func (m noopTableMetaMgr) InitTableMeta(ctx context.Context) error { + return nil +} + +func (m noopTableMetaMgr) AllocTableRowIDs(ctx context.Context, rawRowIDMax int64) (*verify.KVChecksum, int64, error) { + return nil, 0, nil +} + +func (m noopTableMetaMgr) UpdateTableStatus(ctx context.Context, status metaStatus) error { + return nil +} + +func (m noopTableMetaMgr) UpdateTableBaseChecksum(ctx context.Context, checksum *verify.KVChecksum) error { + return nil +} + +func (m noopTableMetaMgr) CheckAndUpdateLocalChecksum(ctx context.Context, checksum *verify.KVChecksum) (bool, *verify.KVChecksum, error) { + return false, nil, nil +} + +func (m noopTableMetaMgr) FinishTable(ctx context.Context) error { + return nil +} diff --git a/pkg/lightning/restore/restore_test.go b/pkg/lightning/restore/restore_test.go index 48a2ec650..8ec8bfdd7 100644 --- a/pkg/lightning/restore/restore_test.go +++ b/pkg/lightning/restore/restore_test.go @@ -54,7 +54,6 @@ import ( "github.com/pingcap/br/pkg/lightning/web" "github.com/pingcap/br/pkg/lightning/worker" "github.com/pingcap/br/pkg/mock" - "github.com/pingcap/br/pkg/pdutil" "github.com/pingcap/br/pkg/storage" "github.com/pingcap/br/pkg/version/build" ) @@ -893,7 +892,7 @@ func (s *tableRestoreSuite) TestTableRestoreMetrics(c *C) { checkpointsDB: cpDB, closedEngineLimit: worker.NewPool(ctx, 1, "closed_engine"), store: s.store, - metaMgrBuilder: testMetaMgrBuilder{}, + metaMgrBuilder: noopMetaMgrBuilder{}, } go func() { for range chptCh { @@ -921,66 +920,6 @@ func (s *tableRestoreSuite) TestTableRestoreMetrics(c *C) { c.Assert(tableFinished-tableFinishedBase, Equals, float64(1)) } -type testMetaMgrBuilder struct{} - -func (b testMetaMgrBuilder) TaskMetaMgr(pd *pdutil.PdController) taskMetaMgr { - return testTaskMetaMgr{} -} - -func (b testMetaMgrBuilder) TableMetaMgr(tr *TableRestore) tableMetaMgr { - return testTableMetaMgr{} -} - -type testTaskMetaMgr struct{} - -func (m testTaskMetaMgr) InitTask(ctx context.Context) error { - return nil -} - -func (m testTaskMetaMgr) CheckAndPausePdSchedulers(ctx context.Context) (pdutil.UndoFunc, error) { - return func(ctx context.Context) error { - return nil - }, nil -} - -func (m testTaskMetaMgr) CheckAndFinishRestore(ctx context.Context) (bool, error) { - return false, nil -} - -func (m testTaskMetaMgr) Cleanup(ctx context.Context) error { - return nil -} - -func (m testTaskMetaMgr) CleanupAllMetas(ctx context.Context) error { - return nil -} - -type testTableMetaMgr struct{} - -func (m testTableMetaMgr) InitTableMeta(ctx context.Context) error { - return nil -} - -func (m testTableMetaMgr) AllocTableRowIDs(ctx context.Context, rawRowIDMax int64) (*verification.KVChecksum, int64, error) { - return nil, 0, nil -} - -func (m testTableMetaMgr) UpdateTableStatus(ctx context.Context, status metaStatus) error { - return nil -} - -func (m testTableMetaMgr) UpdateTableBaseChecksum(ctx context.Context, checksum *verification.KVChecksum) error { - return nil -} - -func (m testTableMetaMgr) CheckAndUpdateLocalChecksum(ctx context.Context, checksum *verification.KVChecksum) (bool, *verification.KVChecksum, error) { - return false, nil, nil -} - -func (m testTableMetaMgr) FinishTable(ctx context.Context) error { - return nil -} - var _ = Suite(&chunkRestoreSuite{}) type 
chunkRestoreSuite struct { From 953aa9c2967775f0d4d199bf885d168824822bd2 Mon Sep 17 00:00:00 2001 From: glorv Date: Tue, 25 May 2021 11:39:02 +0800 Subject: [PATCH 25/32] remove table empty check --- pkg/lightning/restore/restore.go | 25 ------------------------- 1 file changed, 25 deletions(-) diff --git a/pkg/lightning/restore/restore.go b/pkg/lightning/restore/restore.go index 03dc9a129..9c278256a 100644 --- a/pkg/lightning/restore/restore.go +++ b/pkg/lightning/restore/restore.go @@ -692,31 +692,6 @@ func (rc *Controller) restoreSchema(ctx context.Context) error { } rc.dbInfos = dbInfos - if rc.cfg.TikvImporter.Backend != config.BackendTiDB { - for _, dbMeta := range rc.dbMetas { - for _, tableMeta := range dbMeta.Tables { - tableName := common.UniqueTable(dbMeta.Name, tableMeta.Name) - - // if checkpoint enable and not missing, we skip the check table empty progress. - if rc.cfg.Checkpoint.Enable { - _, err := rc.checkpointsDB.Get(ctx, tableName) - switch { - case err == nil: - continue - case errors.IsNotFound(err): - default: - return err - } - } - - err := rc.checkTableEmpty(ctx, tableName) - if err != nil { - return err - } - } - } - } - // Load new checkpoints err = rc.checkpointsDB.Initialize(ctx, rc.cfg, dbInfos) if err != nil { From 8da7fd49f8d8d499637c7d1dcd72b5829e8f67ff Mon Sep 17 00:00:00 2001 From: glorv Date: Tue, 25 May 2021 14:17:06 +0800 Subject: [PATCH 26/32] fix schedulers --- pkg/lightning/restore/restore.go | 13 ++++++++++--- pkg/pdutil/pd.go | 8 ++++---- 2 files changed, 14 insertions(+), 7 deletions(-) diff --git a/pkg/lightning/restore/restore.go b/pkg/lightning/restore/restore.go index 9c278256a..df70ff17b 100644 --- a/pkg/lightning/restore/restore.go +++ b/pkg/lightning/restore/restore.go @@ -3437,6 +3437,7 @@ func (m *dbTaskMetaMgr) InitTask(ctx context.Context) error { } func (m *dbTaskMetaMgr) CheckAndPausePdSchedulers(ctx context.Context) (pdutil.UndoFunc, error) { + pauseCtx, cancel := context.WithCancel(ctx) conn, err := m.session.Conn(ctx) if err != nil { return nil, errors.Trace(err) @@ -3507,7 +3508,7 @@ func (m *dbTaskMetaMgr) CheckAndPausePdSchedulers(ctx context.Context) (pdutil.U return errors.Trace(err) } - orig, removed, err := m.pd.RemoveSchedulersAndReturn(ctx) + orig, removed, err := m.pd.RemoveSchedulersWithOrigin(pauseCtx) if err != nil { return errors.Trace(err) } @@ -3533,12 +3534,18 @@ func (m *dbTaskMetaMgr) CheckAndPausePdSchedulers(ctx context.Context) (pdutil.U } if !paused { - if err = m.pd.RemoveSchedulersWithCfg(ctx, pausedCfg.PauseCfg); err != nil { + if err = m.pd.RemoveSchedulersWithCfg(pauseCtx, pausedCfg.PauseCfg); err != nil { return nil, err } } - return m.pd.MakeUndoFunctionByConfig(pausedCfg.RestoreCfg), nil + cancelFunc := m.pd.MakeUndoFunctionByConfig(pausedCfg.RestoreCfg) + + return func(ctx context.Context) error { + // close the periodic task ctx + cancel() + return cancelFunc(ctx) + }, nil } func (m *dbTaskMetaMgr) CheckAndFinishRestore(ctx context.Context) (bool, error) { diff --git a/pkg/pdutil/pd.go b/pkg/pdutil/pd.go index eac153e65..63276804b 100644 --- a/pkg/pdutil/pd.go +++ b/pkg/pdutil/pd.go @@ -566,18 +566,18 @@ func (p *PdController) MakeUndoFunctionByConfig(config ClusterConfig) UndoFunc { func (p *PdController) RemoveSchedulers(ctx context.Context) (undo UndoFunc, err error) { undo = Nop - _, removed, err1 := p.RemoveSchedulersAndReturn(ctx) + origin, _, err1 := p.RemoveSchedulersWithOrigin(ctx) if err1 != nil { err = err1 return } - undo = p.MakeUndoFunctionByConfig(ClusterConfig{Schedulers: 
removed.Schedulers, ScheduleCfg: removed.ScheduleCfg}) + undo = p.MakeUndoFunctionByConfig(ClusterConfig{Schedulers: origin.Schedulers, ScheduleCfg: origin.ScheduleCfg}) return undo, errors.Trace(err) } -// RemoveSchedulersAndReturn pause and remove br related schedule configs and return the origin and modified configs -func (p *PdController) RemoveSchedulersAndReturn(ctx context.Context) (ClusterConfig, ClusterConfig, error) { +// RemoveSchedulersWithOrigin pause and remove br related schedule configs and return the origin and modified configs +func (p *PdController) RemoveSchedulersWithOrigin(ctx context.Context) (ClusterConfig, ClusterConfig, error) { if span := opentracing.SpanFromContext(ctx); span != nil && span.Tracer() != nil { span1 := span.Tracer().StartSpan("PdController.RemoveSchedulers", opentracing.ChildOf(span.Context())) defer span1.Finish() From dcf1b7d949d271279ef99abce4be8c2bc97b2e18 Mon Sep 17 00:00:00 2001 From: glorv Date: Wed, 26 May 2021 11:03:14 +0800 Subject: [PATCH 27/32] add importer backend --- pkg/lightning/restore/restore.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pkg/lightning/restore/restore.go b/pkg/lightning/restore/restore.go index df70ff17b..246182b4f 100644 --- a/pkg/lightning/restore/restore.go +++ b/pkg/lightning/restore/restore.go @@ -1443,7 +1443,7 @@ func (tr *TableRestore) restoreTable( } // "show table next_row_id" is only available after v4.0.0 - if tidbVersion.Major >= 4 && rc.cfg.TikvImporter.Backend == config.BackendLocal { + if tidbVersion.Major >= 4 && (rc.cfg.TikvImporter.Backend == config.BackendLocal || rc.cfg.TikvImporter.Backend == config.BackendImporter) { // first, insert a new-line into meta table if err = metaMgr.InitTableMeta(ctx); err != nil { return false, err From 965c7eb5432a7c2e9fea011cc0907c1bfdef124d Mon Sep 17 00:00:00 2001 From: glorv Date: Thu, 3 Jun 2021 14:36:56 +0800 Subject: [PATCH 28/32] return error if target table is in checksum phase --- pkg/lightning/restore/restore.go | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/pkg/lightning/restore/restore.go b/pkg/lightning/restore/restore.go index 246182b4f..50dab6423 100644 --- a/pkg/lightning/restore/restore.go +++ b/pkg/lightning/restore/restore.go @@ -3114,6 +3114,10 @@ func (m *dbTableMetaMgr) AllocTableRowIDs(ctx context.Context, rawRowIDMax int64 continue } + if status == metaStatusChecksuming { + return errors.New("target table is calculating checksum, please wait unit the checksum is finished and try again.") + } + if metaTaskID == m.taskID { curStatus = status baseChecksum = checksum From 059175af6d118b95632c88ddab98862dc5382f44 Mon Sep 17 00:00:00 2001 From: glorv Date: Thu, 3 Jun 2021 18:55:27 +0800 Subject: [PATCH 29/32] resolve comments --- pkg/lightning/checkpoints/checkpoints.go | 6 +- pkg/lightning/common/util.go | 7 + pkg/lightning/restore/meta_manager.go | 807 +++++++++++++++++++++ pkg/lightning/restore/meta_manager_test.go | 242 ++++++ pkg/lightning/restore/restore.go | 793 +------------------- pkg/lightning/restore/restore_test.go | 247 +------ 6 files changed, 1070 insertions(+), 1032 deletions(-) create mode 100644 pkg/lightning/restore/meta_manager.go create mode 100644 pkg/lightning/restore/meta_manager_test.go diff --git a/pkg/lightning/checkpoints/checkpoints.go b/pkg/lightning/checkpoints/checkpoints.go index ac2650033..d412553e5 100644 --- a/pkg/lightning/checkpoints/checkpoints.go +++ b/pkg/lightning/checkpoints/checkpoints.go @@ -23,7 +23,6 @@ import ( "math" "os" "sort" - "strings" "sync" 
"github.com/joho/sqltocsv" @@ -609,10 +608,7 @@ type MySQLCheckpointsDB struct { } func NewMySQLCheckpointsDB(ctx context.Context, db *sql.DB, schemaName string) (*MySQLCheckpointsDB, error) { - var escapedSchemaName strings.Builder - common.WriteMySQLIdentifier(&escapedSchemaName, schemaName) - schema := escapedSchemaName.String() - + schema := common.EscapeIdentifier(schemaName) sql := common.SQLWithRetry{ DB: db, Logger: log.With(zap.String("schema", schemaName)), diff --git a/pkg/lightning/common/util.go b/pkg/lightning/common/util.go index c04985d5a..c0ea2622e 100644 --- a/pkg/lightning/common/util.go +++ b/pkg/lightning/common/util.go @@ -274,6 +274,13 @@ func UniqueTable(schema string, table string) string { return builder.String() } +// EscapeIdentifier quote and escape an sql identifier +func EscapeIdentifier(identifier string) string { + var builder strings.Builder + WriteMySQLIdentifier(&builder, identifier) + return builder.String() +} + // Writes a MySQL identifier into the string builder. // The identifier is always escaped into the form "`foo`". func WriteMySQLIdentifier(builder *strings.Builder, identifier string) { diff --git a/pkg/lightning/restore/meta_manager.go b/pkg/lightning/restore/meta_manager.go new file mode 100644 index 000000000..50fe0640c --- /dev/null +++ b/pkg/lightning/restore/meta_manager.go @@ -0,0 +1,807 @@ +package restore + +import ( + "context" + "database/sql" + "encoding/json" + "fmt" + "strings" + + "github.com/pingcap/errors" + "github.com/pingcap/parser/model" + "github.com/pingcap/parser/mysql" + "go.uber.org/zap" + + "github.com/pingcap/br/pkg/lightning/backend/tidb" + "github.com/pingcap/br/pkg/lightning/common" + "github.com/pingcap/br/pkg/lightning/log" + verify "github.com/pingcap/br/pkg/lightning/verification" + "github.com/pingcap/br/pkg/pdutil" + "github.com/pingcap/br/pkg/redact" +) + +type metaMgrBuilder interface { + Init(ctx context.Context) error + TaskMetaMgr(pd *pdutil.PdController) taskMetaMgr + TableMetaMgr(tr *TableRestore) tableMetaMgr +} + +type dbMetaMgrBuilder struct { + db *sql.DB + taskID int64 + schema string +} + +func (b *dbMetaMgrBuilder) Init(ctx context.Context) error { + exec := common.SQLWithRetry{ + DB: b.db, + Logger: log.L(), + HideQueryLog: redact.NeedRedact(), + } + metaDBSQL := fmt.Sprintf("CREATE DATABASE IF NOT EXISTS %s", common.EscapeIdentifier(b.schema)) + if err := exec.Exec(ctx, "create meta schema", metaDBSQL); err != nil { + return errors.Annotate(err, "create meta schema failed") + } + taskMetaSQL := fmt.Sprintf(CreateTaskMetaTable, common.UniqueTable(b.schema, taskMetaTableName)) + if err := exec.Exec(ctx, "create meta table", taskMetaSQL); err != nil { + return errors.Annotate(err, "create task meta table failed") + } + tableMetaSQL := fmt.Sprintf(CreateTableMetadataTable, common.UniqueTable(b.schema, tableMetaTableName)) + if err := exec.Exec(ctx, "create meta table", tableMetaSQL); err != nil { + return errors.Annotate(err, "create table meta table failed") + } + return nil +} + +func (b *dbMetaMgrBuilder) TaskMetaMgr(pd *pdutil.PdController) taskMetaMgr { + return &dbTaskMetaMgr{ + session: b.db, + taskID: b.taskID, + pd: pd, + tableName: common.UniqueTable(b.schema, taskMetaTableName), + schemaName: b.schema, + } +} + +func (b *dbMetaMgrBuilder) TableMetaMgr(tr *TableRestore) tableMetaMgr { + return &dbTableMetaMgr{ + session: b.db, + taskID: b.taskID, + tr: tr, + tableName: common.UniqueTable(b.schema, tableMetaTableName), + } +} + +type tableMetaMgr interface { + InitTableMeta(ctx 
context.Context) error + AllocTableRowIDs(ctx context.Context, rawRowIDMax int64) (*verify.KVChecksum, int64, error) + UpdateTableStatus(ctx context.Context, status metaStatus) error + UpdateTableBaseChecksum(ctx context.Context, checksum *verify.KVChecksum) error + CheckAndUpdateLocalChecksum(ctx context.Context, checksum *verify.KVChecksum) (bool, *verify.KVChecksum, error) + FinishTable(ctx context.Context) error +} + +type dbTableMetaMgr struct { + session *sql.DB + taskID int64 + tr *TableRestore + tableName string +} + +func (m *dbTableMetaMgr) InitTableMeta(ctx context.Context) error { + exec := &common.SQLWithRetry{ + DB: m.session, + Logger: m.tr.logger, + } + // avoid override existing metadata if the meta is already inserted. + stmt := fmt.Sprintf(`INSERT IGNORE INTO %s (task_id, table_id, table_name, status) values (?, ?, ?, ?)`, m.tableName) + task := m.tr.logger.Begin(zap.DebugLevel, "init table meta") + err := exec.Exec(ctx, "init table meta", stmt, m.taskID, m.tr.tableInfo.ID, m.tr.tableName, metaStatusInitial.String()) + task.End(zap.ErrorLevel, err) + return errors.Trace(err) +} + +type metaStatus uint32 + +const ( + metaStatusInitial metaStatus = iota + metaStatusRowIDAllocated + metaStatusRestoreStarted + metaStatusRestoreFinished + metaStatusChecksuming + metaStatusChecksumSkipped + metaStatusFinished +) + +func (m metaStatus) String() string { + switch m { + case metaStatusInitial: + return "initialized" + case metaStatusRowIDAllocated: + return "allocated" + case metaStatusRestoreStarted: + return "restore" + case metaStatusRestoreFinished: + return "restore_finished" + case metaStatusChecksuming: + return "checksuming" + case metaStatusChecksumSkipped: + return "checksum_skipped" + case metaStatusFinished: + return "finish" + default: + panic(fmt.Sprintf("unexpected metaStatus value '%d'", m)) + } +} + +func parseMetaStatus(s string) (metaStatus, error) { + switch s { + case "", "initialized": + return metaStatusInitial, nil + case "allocated": + return metaStatusRowIDAllocated, nil + case "restore": + return metaStatusRestoreStarted, nil + case "restore_finished": + return metaStatusRestoreFinished, nil + case "checksuming": + return metaStatusChecksuming, nil + case "checksum_skipped": + return metaStatusChecksumSkipped, nil + case "finish": + return metaStatusFinished, nil + default: + return metaStatusInitial, errors.Errorf("invalid meta status '%s'", s) + } +} + +func (m *dbTableMetaMgr) AllocTableRowIDs(ctx context.Context, rawRowIDMax int64) (*verify.KVChecksum, int64, error) { + conn, err := m.session.Conn(ctx) + if err != nil { + return nil, 0, errors.Trace(err) + } + defer conn.Close() + exec := &common.SQLWithRetry{ + DB: m.session, + Logger: m.tr.logger, + } + var newRowIDBase, newRowIDMax int64 + curStatus := metaStatusInitial + newStatus := metaStatusRowIDAllocated + var baseTotalKvs, baseTotalBytes, baseChecksum uint64 + err = exec.Exec(ctx, "enable pessimistic transaction", "SET SESSION tidb_txn_mode = 'pessimistic';") + if err != nil { + return nil, 0, errors.Annotate(err, "enable pessimistic transaction failed") + } + needAutoID := common.TableHasAutoRowID(m.tr.tableInfo.Core) || m.tr.tableInfo.Core.GetAutoIncrementColInfo() != nil || m.tr.tableInfo.Core.ContainsAutoRandomBits() + err = exec.Transact(ctx, "init table allocator base", func(ctx context.Context, tx *sql.Tx) error { + query := fmt.Sprintf("SELECT task_id, row_id_base, row_id_max, total_kvs_base, total_bytes_base, checksum_base, status from %s WHERE table_id = ? 
FOR UPDATE", m.tableName) + rows, err := tx.QueryContext(ctx, query, m.tr.tableInfo.ID) + if err != nil { + return errors.Trace(err) + } + defer rows.Close() + var ( + metaTaskID, rowIDBase, rowIDMax, maxRowIDMax int64 + totalKvs, totalBytes, checksum uint64 + statusValue string + ) + for rows.Next() { + if err = rows.Scan(&metaTaskID, &rowIDBase, &rowIDMax, &totalKvs, &totalBytes, &checksum, &statusValue); err != nil { + return errors.Trace(err) + } + status, err := parseMetaStatus(statusValue) + if err != nil { + return errors.Annotatef(err, "invalid meta status '%s'", statusValue) + } + + // skip finished meta + if status >= metaStatusFinished { + continue + } + + if status == metaStatusChecksuming { + return errors.New("target table is calculating checksum, please wait unit the checksum is finished and try again.") + } + + if metaTaskID == m.taskID { + curStatus = status + baseChecksum = checksum + baseTotalKvs = totalKvs + baseTotalBytes = totalBytes + if status >= metaStatusRowIDAllocated { + if rowIDMax-rowIDBase != rawRowIDMax { + return errors.Errorf("verify allocator base failed. local: '%d', meta: '%d'", rawRowIDMax, rowIDMax-rowIDBase) + } + newRowIDBase = rowIDBase + newRowIDMax = rowIDMax + break + } + continue + } + + // other tasks has finished this logic, we needn't do again. + if status >= metaStatusRowIDAllocated { + newStatus = metaStatusRestoreStarted + } + + if rowIDMax > maxRowIDMax { + maxRowIDMax = rowIDMax + } + } + + // no enough info are available, fetch row_id max for table + if curStatus == metaStatusInitial { + if needAutoID && maxRowIDMax == 0 { + // NOTE: currently, if a table contains auto_incremental unique key and _tidb_rowid, + // the `show table next_row_id` will returns the unique key field only. + var autoIDField string + for _, col := range m.tr.tableInfo.Core.Columns { + if mysql.HasAutoIncrementFlag(col.Flag) { + autoIDField = col.Name.L + break + } else if mysql.HasPriKeyFlag(col.Flag) && m.tr.tableInfo.Core.AutoRandomBits > 0 { + autoIDField = col.Name.L + break + } + } + if len(autoIDField) == 0 && common.TableHasAutoRowID(m.tr.tableInfo.Core) { + autoIDField = model.ExtraHandleName.L + } + if len(autoIDField) == 0 { + return errors.Errorf("table %s contains auto increment id or _tidb_rowid, but target field not found", m.tr.tableName) + } + + autoIDInfos, err := tidb.FetchTableAutoIDInfos(ctx, tx, m.tr.tableName) + if err != nil { + return errors.Trace(err) + } + found := false + for _, info := range autoIDInfos { + if strings.ToLower(info.Column) == autoIDField { + maxRowIDMax = info.NextID - 1 + found = true + break + } + } + if !found { + return errors.Errorf("can't fetch previous auto id base for table %s field '%s'", m.tr.tableName, autoIDField) + } + } + newRowIDBase = maxRowIDMax + newRowIDMax = newRowIDBase + rawRowIDMax + // table contains no data, can skip checksum + if needAutoID && newRowIDBase == 0 && newStatus < metaStatusRestoreStarted { + newStatus = metaStatusRestoreStarted + } + query = fmt.Sprintf("update %s set row_id_base = ?, row_id_max = ?, status = ? where table_id = ? and task_id = ?", m.tableName) + _, err := tx.ExecContext(ctx, query, newRowIDBase, newRowIDMax, newStatus.String(), m.tr.tableInfo.ID, m.taskID) + if err != nil { + return errors.Trace(err) + } + + curStatus = newStatus + } + return nil + }) + if err != nil { + return nil, 0, errors.Trace(err) + } + + var checksum *verify.KVChecksum + // need to do checksum and update checksum meta since we are the first one. 
+ if curStatus < metaStatusRestoreStarted { + // table contains data but haven't do checksum yet + if (newRowIDBase > 0 || !needAutoID) && baseTotalKvs == 0 { + remoteCk, err := DoChecksum(ctx, m.tr.tableInfo) + if err != nil { + return nil, 0, errors.Trace(err) + } + + if remoteCk.Checksum != baseChecksum || remoteCk.TotalKVs != baseTotalKvs || remoteCk.TotalBytes != baseTotalBytes { + ck := verify.MakeKVChecksum(remoteCk.TotalBytes, remoteCk.TotalKVs, remoteCk.Checksum) + checksum = &ck + } + + } + + if checksum != nil { + if err = m.UpdateTableBaseChecksum(ctx, checksum); err != nil { + return nil, 0, errors.Trace(err) + } + + m.tr.logger.Info("checksum before restore table", zap.Object("checksum", checksum)) + } else if err = m.UpdateTableStatus(ctx, metaStatusRestoreStarted); err != nil { + return nil, 0, errors.Trace(err) + } + } + if checksum == nil && baseTotalKvs > 0 { + ck := verify.MakeKVChecksum(baseTotalBytes, baseTotalKvs, baseChecksum) + checksum = &ck + } + log.L().Info("allocate table row_id base", zap.String("table", m.tr.tableName), + zap.Int64("row_id_base", newRowIDBase)) + if checksum != nil { + log.L().Info("checksum base", zap.Any("checksum", checksum)) + } + return checksum, newRowIDBase, nil +} + +func (m *dbTableMetaMgr) UpdateTableBaseChecksum(ctx context.Context, checksum *verify.KVChecksum) error { + exec := &common.SQLWithRetry{ + DB: m.session, + Logger: m.tr.logger, + } + query := fmt.Sprintf("update %s set total_kvs_base = ?, total_bytes_base = ?, checksum_base = ?, status = ? where table_id = ? and task_id = ?", m.tableName) + + return exec.Exec(ctx, "update base checksum", query, checksum.SumKVS(), + checksum.SumSize(), checksum.Sum(), metaStatusRestoreStarted.String(), m.tr.tableInfo.ID, m.taskID) +} + +func (m *dbTableMetaMgr) UpdateTableStatus(ctx context.Context, status metaStatus) error { + exec := &common.SQLWithRetry{ + DB: m.session, + Logger: m.tr.logger, + } + query := fmt.Sprintf("update %s set status = ? where table_id = ? and task_id = ?", m.tableName) + return exec.Exec(ctx, "update meta status", query, status.String(), m.tr.tableInfo.ID, m.taskID) +} + +func (m *dbTableMetaMgr) CheckAndUpdateLocalChecksum(ctx context.Context, checksum *verify.KVChecksum) (bool, *verify.KVChecksum, error) { + conn, err := m.session.Conn(ctx) + if err != nil { + return false, nil, errors.Trace(err) + } + defer conn.Close() + exec := &common.SQLWithRetry{ + DB: m.session, + Logger: m.tr.logger, + } + err = exec.Exec(ctx, "enable pessimistic transaction", "SET SESSION tidb_txn_mode = 'pessimistic';") + if err != nil { + return false, nil, errors.Annotate(err, "enable pessimistic transaction failed") + } + var ( + baseTotalKvs, baseTotalBytes, baseChecksum uint64 + taskKvs, taskBytes, taskChecksum uint64 + totalKvs, totalBytes, totalChecksum uint64 + ) + newStatus := metaStatusChecksuming + needChecksum := true + err = exec.Transact(ctx, "checksum pre-check", func(ctx context.Context, tx *sql.Tx) error { + query := fmt.Sprintf("SELECT task_id, total_kvs_base, total_bytes_base, checksum_base, total_kvs, total_bytes, checksum, status from %s WHERE table_id = ? 
FOR UPDATE", m.tableName) + rows, err := tx.QueryContext(ctx, query, m.tr.tableInfo.ID) + if err != nil { + return errors.Annotate(err, "fetch task meta failed") + } + closed := false + defer func() { + if !closed { + rows.Close() + } + }() + var ( + taskID int64 + statusValue string + ) + for rows.Next() { + if err = rows.Scan(&taskID, &baseTotalKvs, &baseTotalBytes, &baseChecksum, &taskKvs, &taskBytes, &taskChecksum, &statusValue); err != nil { + return errors.Trace(err) + } + status, err := parseMetaStatus(statusValue) + if err != nil { + return errors.Annotatef(err, "invalid meta status '%s'", statusValue) + } + + // skip finished meta + if status >= metaStatusFinished { + continue + } + + if taskID == m.taskID { + if status >= metaStatusChecksuming { + newStatus = status + needChecksum = status == metaStatusChecksuming + return nil + } + + continue + } + + if status < metaStatusChecksuming { + newStatus = metaStatusChecksumSkipped + needChecksum = false + break + } else if status == metaStatusChecksuming { + return errors.New("another task is checksuming, there must be something wrong!") + } + + totalBytes += baseTotalBytes + totalKvs += baseTotalKvs + totalChecksum ^= baseChecksum + + totalBytes += taskBytes + totalKvs += taskKvs + totalChecksum ^= taskChecksum + } + rows.Close() + closed = true + + query = fmt.Sprintf("update %s set total_kvs = ?, total_bytes = ?, checksum = ?, status = ? where table_id = ? and task_id = ?", m.tableName) + _, err = tx.ExecContext(ctx, query, checksum.SumKVS(), checksum.SumSize(), checksum.Sum(), newStatus.String(), m.tr.tableInfo.ID, m.taskID) + return errors.Annotate(err, "update local checksum failed") + }) + if err != nil { + return false, nil, err + } + + var remoteChecksum *verify.KVChecksum + if needChecksum { + ck := verify.MakeKVChecksum(totalBytes, totalKvs, totalChecksum) + remoteChecksum = &ck + } + log.L().Info("check table checksum", zap.String("table", m.tr.tableName), + zap.Bool("checksum", needChecksum), zap.String("new_status", newStatus.String())) + return needChecksum, remoteChecksum, nil +} + +func (m *dbTableMetaMgr) FinishTable(ctx context.Context) error { + exec := &common.SQLWithRetry{ + DB: m.session, + Logger: m.tr.logger, + } + query := fmt.Sprintf("DELETE FROM %s where table_id = ? 
and (status = 'checksuming' or status = 'checksum_skipped')", m.tableName) + return exec.Exec(ctx, "clean up metas", query, m.tr.tableInfo.ID) +} + +type taskMetaMgr interface { + InitTask(ctx context.Context) error + CheckAndPausePdSchedulers(ctx context.Context) (pdutil.UndoFunc, error) + CheckAndFinishRestore(ctx context.Context) (bool, error) + Cleanup(ctx context.Context) error + CleanupAllMetas(ctx context.Context) error +} + +type dbTaskMetaMgr struct { + session *sql.DB + taskID int64 + pd *pdutil.PdController + // unique name of task meta table + tableName string + schemaName string +} + +type taskMetaStatus uint32 + +const ( + taskMetaStatusInitial taskMetaStatus = iota + taskMetaStatusScheduleSet + taskMetaStatusSwitchSkipped + taskMetaStatusSwitchBack +) + +func (m taskMetaStatus) String() string { + switch m { + case taskMetaStatusInitial: + return "initialized" + case taskMetaStatusScheduleSet: + return "schedule_set" + case taskMetaStatusSwitchSkipped: + return "skip_switch" + case taskMetaStatusSwitchBack: + return "switched" + default: + panic(fmt.Sprintf("unexpected metaStatus value '%d'", m)) + } +} + +func parseTaskMetaStatus(s string) (taskMetaStatus, error) { + switch s { + case "", "initialized": + return taskMetaStatusInitial, nil + case "schedule_set": + return taskMetaStatusScheduleSet, nil + case "skip_switch": + return taskMetaStatusSwitchSkipped, nil + case "switched": + return taskMetaStatusSwitchBack, nil + default: + return taskMetaStatusInitial, errors.Errorf("invalid meta status '%s'", s) + } +} + +type storedCfgs struct { + PauseCfg pdutil.ClusterConfig `json:"paused"` + RestoreCfg pdutil.ClusterConfig `json:"restore"` +} + +func (m *dbTaskMetaMgr) InitTask(ctx context.Context) error { + exec := &common.SQLWithRetry{ + DB: m.session, + Logger: log.L(), + } + // avoid override existing metadata if the meta is already inserted. 
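+	// INSERT IGNORE keeps the previously stored row for this task_id, so a re-run
+	// of the same task resumes from its recorded status instead of resetting it.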
+ stmt := fmt.Sprintf(`INSERT IGNORE INTO %s (task_id, status) values (?, ?)`, m.tableName) + err := exec.Exec(ctx, "init task meta", stmt, m.taskID, taskMetaStatusInitial.String()) + return errors.Trace(err) +} + +func (m *dbTaskMetaMgr) CheckAndPausePdSchedulers(ctx context.Context) (pdutil.UndoFunc, error) { + pauseCtx, cancel := context.WithCancel(ctx) + conn, err := m.session.Conn(ctx) + if err != nil { + return nil, errors.Trace(err) + } + defer conn.Close() + exec := &common.SQLWithRetry{ + DB: m.session, + Logger: log.L(), + } + err = exec.Exec(ctx, "enable pessimistic transaction", "SET SESSION tidb_txn_mode = 'pessimistic';") + if err != nil { + return nil, errors.Annotate(err, "enable pessimistic transaction failed") + } + + needSwitch := true + paused := false + var pausedCfg storedCfgs + err = exec.Transact(ctx, "check and pause schedulers", func(ctx context.Context, tx *sql.Tx) error { + query := fmt.Sprintf("SELECT task_id, pd_cfgs, status from %s FOR UPDATE", m.tableName) + rows, err := tx.QueryContext(ctx, query) + if err != nil { + return errors.Annotate(err, "fetch task meta failed") + } + closed := false + defer func() { + if !closed { + rows.Close() + } + }() + var ( + taskID int64 + cfg string + statusValue string + ) + var cfgStr string + for rows.Next() { + if err = rows.Scan(&taskID, &cfg, &statusValue); err != nil { + return errors.Trace(err) + } + status, err := parseTaskMetaStatus(statusValue) + if err != nil { + return errors.Annotatef(err, "invalid task meta status '%s'", statusValue) + } + + if status == taskMetaStatusInitial { + continue + } + + if taskID == m.taskID { + if status >= taskMetaStatusSwitchSkipped { + needSwitch = false + return nil + } + } + + if cfg != "" { + cfgStr = cfg + break + } + } + if err = rows.Close(); err != nil { + return errors.Trace(err) + } + closed = true + + if cfgStr != "" { + err = json.Unmarshal([]byte(cfgStr), &pausedCfg) + return errors.Trace(err) + } + + orig, removed, err := m.pd.RemoveSchedulersWithOrigin(pauseCtx) + if err != nil { + return errors.Trace(err) + } + paused = true + + pausedCfg = storedCfgs{PauseCfg: removed, RestoreCfg: orig} + jsonByts, err := json.Marshal(&pausedCfg) + if err != nil { + return errors.Trace(err) + } + + query = fmt.Sprintf("update %s set pd_cfgs = ?, status = ? 
where task_id = ?", m.tableName) + _, err = tx.ExecContext(ctx, query, string(jsonByts), taskMetaStatusScheduleSet.String(), m.taskID) + + return errors.Annotate(err, "update task pd configs failed") + }) + if err != nil { + return nil, err + } + + if !needSwitch { + return nil, nil + } + + if !paused { + if err = m.pd.RemoveSchedulersWithCfg(pauseCtx, pausedCfg.PauseCfg); err != nil { + return nil, err + } + } + + cancelFunc := m.pd.MakeUndoFunctionByConfig(pausedCfg.RestoreCfg) + + return func(ctx context.Context) error { + // close the periodic task ctx + cancel() + return cancelFunc(ctx) + }, nil +} + +func (m *dbTaskMetaMgr) CheckAndFinishRestore(ctx context.Context) (bool, error) { + conn, err := m.session.Conn(ctx) + if err != nil { + return false, errors.Trace(err) + } + defer conn.Close() + exec := &common.SQLWithRetry{ + DB: m.session, + Logger: log.L(), + } + err = exec.Exec(ctx, "enable pessimistic transaction", "SET SESSION tidb_txn_mode = 'pessimistic';") + if err != nil { + return false, errors.Annotate(err, "enable pessimistic transaction failed") + } + + switchBack := true + err = exec.Transact(ctx, "check and finish schedulers", func(ctx context.Context, tx *sql.Tx) error { + query := fmt.Sprintf("SELECT task_id, status from %s FOR UPDATE", m.tableName) + rows, err := tx.QueryContext(ctx, query) + if err != nil { + return errors.Annotate(err, "fetch task meta failed") + } + closed := false + defer func() { + if !closed { + rows.Close() + } + }() + var ( + taskID int64 + statusValue string + ) + newStatus := taskMetaStatusSwitchBack + for rows.Next() { + if err = rows.Scan(&taskID, &statusValue); err != nil { + return errors.Trace(err) + } + status, err := parseTaskMetaStatus(statusValue) + if err != nil { + return errors.Annotatef(err, "invalid task meta status '%s'", statusValue) + } + + if taskID == m.taskID { + continue + } + + if status < taskMetaStatusSwitchSkipped { + newStatus = taskMetaStatusSwitchSkipped + switchBack = false + break + } + } + if err = rows.Close(); err != nil { + return errors.Trace(err) + } + closed = true + + query = fmt.Sprintf("update %s set status = ? where task_id = ?", m.tableName) + _, err = tx.ExecContext(ctx, query, newStatus.String(), m.taskID) + + return errors.Trace(err) + }) + + return switchBack, err +} + +func (m *dbTaskMetaMgr) Cleanup(ctx context.Context) error { + exec := &common.SQLWithRetry{ + DB: m.session, + Logger: log.L(), + } + // avoid override existing metadata if the meta is already inserted. + stmt := fmt.Sprintf("DROP TABLE %s;", m.tableName) + if err := exec.Exec(ctx, "cleanup task meta tables", stmt); err != nil { + return errors.Trace(err) + } + return nil +} + +func (m *dbTaskMetaMgr) CleanupAllMetas(ctx context.Context) error { + exec := &common.SQLWithRetry{ + DB: m.session, + Logger: log.L(), + } + + // check if all tables are finished + query := fmt.Sprintf("SELECT COUNT(*) from %s", common.UniqueTable(m.schemaName, tableMetaTableName)) + var cnt int + if err := exec.QueryRow(ctx, "fetch table meta row count", query, &cnt); err != nil { + return errors.Trace(err) + } + if cnt > 0 { + log.L().Warn("there are unfinished table in table meta table, cleanup skipped.") + return nil + } + + // avoid override existing metadata if the meta is already inserted. 
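+	// no table meta rows are left, so the whole lightning meta schema can be dropped.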
+ stmt := fmt.Sprintf("DROP DATABASE %s;", common.EscapeIdentifier(m.schemaName)) + if err := exec.Exec(ctx, "cleanup task meta tables", stmt); err != nil { + return errors.Trace(err) + } + return nil +} + +type noopMetaMgrBuilder struct{} + +func (b noopMetaMgrBuilder) Init(ctx context.Context) error { + return nil +} + +func (b noopMetaMgrBuilder) TaskMetaMgr(pd *pdutil.PdController) taskMetaMgr { + return noopTaskMetaMgr{} +} + +func (b noopMetaMgrBuilder) TableMetaMgr(tr *TableRestore) tableMetaMgr { + return noopTableMetaMgr{} +} + +type noopTaskMetaMgr struct{} + +func (m noopTaskMetaMgr) InitTask(ctx context.Context) error { + return nil +} + +func (m noopTaskMetaMgr) CheckAndPausePdSchedulers(ctx context.Context) (pdutil.UndoFunc, error) { + return func(ctx context.Context) error { + return nil + }, nil +} + +func (m noopTaskMetaMgr) CheckAndFinishRestore(ctx context.Context) (bool, error) { + return false, nil +} + +func (m noopTaskMetaMgr) Cleanup(ctx context.Context) error { + return nil +} + +func (m noopTaskMetaMgr) CleanupAllMetas(ctx context.Context) error { + return nil +} + +type noopTableMetaMgr struct{} + +func (m noopTableMetaMgr) InitTableMeta(ctx context.Context) error { + return nil +} + +func (m noopTableMetaMgr) AllocTableRowIDs(ctx context.Context, rawRowIDMax int64) (*verify.KVChecksum, int64, error) { + return nil, 0, nil +} + +func (m noopTableMetaMgr) UpdateTableStatus(ctx context.Context, status metaStatus) error { + return nil +} + +func (m noopTableMetaMgr) UpdateTableBaseChecksum(ctx context.Context, checksum *verify.KVChecksum) error { + return nil +} + +func (m noopTableMetaMgr) CheckAndUpdateLocalChecksum(ctx context.Context, checksum *verify.KVChecksum) (bool, *verify.KVChecksum, error) { + return false, nil, nil +} + +func (m noopTableMetaMgr) FinishTable(ctx context.Context) error { + return nil +} diff --git a/pkg/lightning/restore/meta_manager_test.go b/pkg/lightning/restore/meta_manager_test.go new file mode 100644 index 000000000..bfe35106e --- /dev/null +++ b/pkg/lightning/restore/meta_manager_test.go @@ -0,0 +1,242 @@ +package restore + +import ( + "context" + "database/sql" + "database/sql/driver" + + "github.com/DATA-DOG/go-sqlmock" + . 
"github.com/pingcap/check" + "github.com/pingcap/parser" + "github.com/pingcap/parser/ast" + "github.com/pingcap/parser/model" + "github.com/pingcap/tidb/ddl" + tmock "github.com/pingcap/tidb/util/mock" + "go.uber.org/zap" + + "github.com/pingcap/br/pkg/lightning/checkpoints" + "github.com/pingcap/br/pkg/lightning/common" + "github.com/pingcap/br/pkg/lightning/log" + "github.com/pingcap/br/pkg/lightning/verification" +) + +var _ = Suite(&metaMgrSuite{}) + +type metaMgrSuite struct { + dbHandle *sql.DB + mockDB sqlmock.Sqlmock + tr *TableRestore + mgr *dbTableMetaMgr + checksumMgr *testChecksumMgr +} + +func (s *metaMgrSuite) SetUpSuite(c *C) { + p := parser.New() + se := tmock.NewContext() + + node, err := p.ParseOneStmt("CREATE TABLE `t1` (`c1` varchar(5) NOT NULL)", "utf8mb4", "utf8mb4_bin") + c.Assert(err, IsNil) + tableInfo, err := ddl.MockTableInfo(se, node.(*ast.CreateTableStmt), int64(1)) + c.Assert(err, IsNil) + tableInfo.State = model.StatePublic + + schema := "test" + tb := "t1" + ti := &checkpoints.TidbTableInfo{ + ID: tableInfo.ID, + DB: schema, + Name: tb, + Core: tableInfo, + } + + tableName := common.UniqueTable(schema, tb) + logger := log.With(zap.String("table", tableName)) + s.tr = &TableRestore{ + tableName: tableName, + tableInfo: ti, + logger: logger, + } +} + +func (s *metaMgrSuite) SetUpTest(c *C) { + db, m, err := sqlmock.New() + c.Assert(err, IsNil) + + s.mgr = &dbTableMetaMgr{ + session: db, + taskID: 1, + tr: s.tr, + tableName: common.UniqueTable("test", tableMetaTableName), + } + s.mockDB = m + s.checksumMgr = &testChecksumMgr{} +} + +func (s *metaMgrSuite) TearDownTest(c *C) { + c.Assert(s.mockDB.ExpectationsWereMet(), IsNil) +} + +func (s *metaMgrSuite) TestAllocTableRowIDsSingleTable(c *C) { + ctx := context.WithValue(context.Background(), &checksumManagerKey, s.checksumMgr) + + rows := [][]driver.Value{ + {int64(1), int64(0), int64(0), uint64(0), uint64(0), uint64(0), "initialized"}, + } + nextID := int64(1) + updateArgs := []driver.Value{int64(0), int64(10), "restore", int64(1), int64(1)} + s.prepareMock(rows, &nextID, updateArgs, nil, nil) + + ck, rowIDBase, err := s.mgr.AllocTableRowIDs(ctx, 10) + c.Assert(err, IsNil) + c.Assert(rowIDBase, Equals, int64(0)) + c.Assert(ck, IsNil) + c.Assert(s.checksumMgr.callCnt, Equals, 0) +} + +func (s *metaMgrSuite) TestAllocTableRowIDsSingleTableAutoIDNot0(c *C) { + ctx := context.WithValue(context.Background(), &checksumManagerKey, s.checksumMgr) + + rows := [][]driver.Value{ + {int64(1), int64(0), int64(0), uint64(0), uint64(0), uint64(0), "initialized"}, + } + nextID := int64(999) + updateArgs := []driver.Value{int64(998), int64(1008), "allocated", int64(1), int64(1)} + newStatus := "restore" + s.prepareMock(rows, &nextID, updateArgs, nil, &newStatus) + + ck, rowIDBase, err := s.mgr.AllocTableRowIDs(ctx, 10) + c.Assert(err, IsNil) + c.Assert(rowIDBase, Equals, int64(998)) + c.Assert(ck, IsNil) + c.Assert(s.checksumMgr.callCnt, Equals, 1) +} + +func (s *metaMgrSuite) TestAllocTableRowIDsSingleTableContainsData(c *C) { + ctx := context.WithValue(context.Background(), &checksumManagerKey, s.checksumMgr) + + rows := [][]driver.Value{ + {int64(1), int64(0), int64(0), uint64(0), uint64(0), uint64(0), "initialized"}, + } + nextID := int64(999) + checksum := verification.MakeKVChecksum(1, 2, 3) + updateArgs := []driver.Value{int64(998), int64(1008), "allocated", int64(1), int64(1)} + s.prepareMock(rows, &nextID, updateArgs, &checksum, nil) + + ck, rowIDBase, err := s.mgr.AllocTableRowIDs(ctx, 10) + c.Assert(err, IsNil) + 
c.Assert(rowIDBase, Equals, int64(998)) + c.Assert(ck, DeepEquals, &checksum) + c.Assert(s.checksumMgr.callCnt, Equals, 1) +} + +func (s *metaMgrSuite) TestAllocTableRowIDsAllocated(c *C) { + ctx := context.WithValue(context.Background(), &checksumManagerKey, s.checksumMgr) + + rows := [][]driver.Value{ + {int64(1), int64(998), int64(1008), uint64(0), uint64(0), uint64(0), metaStatusRowIDAllocated.String()}, + } + checksum := verification.MakeKVChecksum(2, 1, 3) + s.prepareMock(rows, nil, nil, &checksum, nil) + + ck, rowIDBase, err := s.mgr.AllocTableRowIDs(ctx, 10) + c.Assert(err, IsNil) + c.Assert(rowIDBase, Equals, int64(998)) + c.Assert(ck, DeepEquals, &checksum) + c.Assert(s.checksumMgr.callCnt, Equals, 1) +} + +func (s *metaMgrSuite) TestAllocTableRowIDsFinished(c *C) { + ctx := context.WithValue(context.Background(), &checksumManagerKey, s.checksumMgr) + + rows := [][]driver.Value{ + {int64(1), int64(998), int64(1008), uint64(1), uint64(2), uint64(3), metaStatusRestoreStarted.String()}, + } + checksum := verification.MakeKVChecksum(2, 1, 3) + s.prepareMock(rows, nil, nil, nil, nil) + + ck, rowIDBase, err := s.mgr.AllocTableRowIDs(ctx, 10) + c.Assert(err, IsNil) + c.Assert(rowIDBase, Equals, int64(998)) + c.Assert(ck, DeepEquals, &checksum) + c.Assert(s.checksumMgr.callCnt, Equals, 0) +} + +func (s *metaMgrSuite) TestAllocTableRowIDsMultiTasksInit(c *C) { + ctx := context.WithValue(context.Background(), &checksumManagerKey, s.checksumMgr) + + rows := [][]driver.Value{ + {int64(1), int64(0), int64(0), uint64(0), uint64(0), uint64(0), "initialized"}, + {int64(2), int64(0), int64(0), uint64(0), uint64(0), uint64(0), "initialized"}, + } + nextID := int64(1) + updateArgs := []driver.Value{int64(0), int64(10), "restore", int64(1), int64(1)} + s.prepareMock(rows, &nextID, updateArgs, nil, nil) + + ck, rowIDBase, err := s.mgr.AllocTableRowIDs(ctx, 10) + c.Assert(err, IsNil) + c.Assert(rowIDBase, Equals, int64(0)) + c.Assert(ck, IsNil) + c.Assert(s.checksumMgr.callCnt, Equals, 0) +} + +func (s *metaMgrSuite) TestAllocTableRowIDsMultiTasksAllocated(c *C) { + ctx := context.WithValue(context.Background(), &checksumManagerKey, s.checksumMgr) + + rows := [][]driver.Value{ + {int64(1), int64(0), int64(0), uint64(0), uint64(0), uint64(0), metaStatusInitial.String()}, + {int64(2), int64(0), int64(100), uint64(0), uint64(0), uint64(0), metaStatusRowIDAllocated.String()}, + } + updateArgs := []driver.Value{int64(100), int64(110), "restore", int64(1), int64(1)} + s.prepareMock(rows, nil, updateArgs, nil, nil) + + ck, rowIDBase, err := s.mgr.AllocTableRowIDs(ctx, 10) + c.Assert(err, IsNil) + c.Assert(rowIDBase, Equals, int64(100)) + c.Assert(ck, IsNil) + c.Assert(s.checksumMgr.callCnt, Equals, 0) +} + +func (s *metaMgrSuite) prepareMock(rowsVal [][]driver.Value, nextRowID *int64, updateArgs []driver.Value, checksum *verification.KVChecksum, updateStatus *string) { + s.mockDB.ExpectExec("SET SESSION tidb_txn_mode = 'pessimistic';"). + WillReturnResult(sqlmock.NewResult(int64(0), int64(0))) + + s.mockDB.ExpectBegin() + + rows := sqlmock.NewRows([]string{"task_id", "row_id_base", "row_id_max", "total_kvs_base", "total_bytes_base", "checksum_base", "status"}) + for _, r := range rowsVal { + rows = rows.AddRow(r...) + } + s.mockDB.ExpectQuery("\\QSELECT task_id, row_id_base, row_id_max, total_kvs_base, total_bytes_base, checksum_base, status from `test`.`table_meta` WHERE table_id = ? FOR UPDATE\\E"). + WithArgs(int64(1)). 
+ WillReturnRows(rows) + if nextRowID != nil { + s.mockDB.ExpectQuery("SHOW TABLE `test`.`t1` NEXT_ROW_ID"). + WillReturnRows(sqlmock.NewRows([]string{"DB_NAME", "TABLE_NAME", "COLUMN_NAME", "NEXT_GLOBAL_ROW_ID", "ID_TYPE"}). + AddRow("test", "t1", "_tidb_rowid", *nextRowID, "AUTO_INCREMENT")) + } + + if len(updateArgs) > 0 { + s.mockDB.ExpectExec("\\Qupdate `test`.`table_meta` set row_id_base = ?, row_id_max = ?, status = ? where table_id = ? and task_id = ?\\E"). + WithArgs(updateArgs...). + WillReturnResult(sqlmock.NewResult(int64(0), int64(1))) + } + + s.mockDB.ExpectCommit() + + if checksum != nil { + s.mockDB.ExpectExec("\\Qupdate `test`.`table_meta` set total_kvs_base = ?, total_bytes_base = ?, checksum_base = ?, status = ? where table_id = ? and task_id = ?\\E"). + WithArgs(checksum.SumKVS(), checksum.SumSize(), checksum.Sum(), metaStatusRestoreStarted.String(), int64(1), int64(1)). + WillReturnResult(sqlmock.NewResult(int64(0), int64(1))) + s.checksumMgr.checksum = RemoteChecksum{ + TotalBytes: checksum.SumSize(), + TotalKVs: checksum.SumKVS(), + Checksum: checksum.Sum(), + } + } + + if updateStatus != nil { + s.mockDB.ExpectExec("\\Qupdate `test`.`table_meta` set status = ? where table_id = ? and task_id = ?\\E"). + WithArgs(*updateStatus, int64(1), int64(1)). + WillReturnResult(sqlmock.NewResult(int64(0), int64(1))) + } +} diff --git a/pkg/lightning/restore/restore.go b/pkg/lightning/restore/restore.go index 50dab6423..b83963f62 100644 --- a/pkg/lightning/restore/restore.go +++ b/pkg/lightning/restore/restore.go @@ -16,7 +16,6 @@ package restore import ( "context" "database/sql" - "encoding/json" "fmt" "io" "math" @@ -32,7 +31,6 @@ import ( "github.com/pingcap/failpoint" sstpb "github.com/pingcap/kvproto/pkg/import_sstpb" "github.com/pingcap/parser/model" - "github.com/pingcap/parser/mysql" "github.com/pingcap/tidb/meta/autoid" "github.com/pingcap/tidb/store/tikv/oracle" "github.com/pingcap/tidb/table" @@ -59,7 +57,6 @@ import ( "github.com/pingcap/br/pkg/lightning/web" "github.com/pingcap/br/pkg/lightning/worker" "github.com/pingcap/br/pkg/pdutil" - "github.com/pingcap/br/pkg/redact" "github.com/pingcap/br/pkg/storage" "github.com/pingcap/br/pkg/utils" "github.com/pingcap/br/pkg/version" @@ -88,7 +85,7 @@ const ( taskMetaTableName = "task_meta" tableMetaTableName = "table_meta" // CreateTableMetadataTable stores the per-table sub jobs information used by TiDB Lightning - CreateTableMetadataTable = `CREATE TABLE IF NOT EXISTS %s.%s ( + CreateTableMetadataTable = `CREATE TABLE IF NOT EXISTS %s ( task_id BIGINT(20) UNSIGNED, table_id BIGINT(64) NOT NULL, table_name VARCHAR(64) NOT NULL, @@ -104,7 +101,7 @@ const ( PRIMARY KEY (table_id, task_id) );` // CreateTaskMetaTable stores the pre-lightning metadata used by TiDB Lightning - CreateTaskMetaTable = `CREATE TABLE IF NOT EXISTS %s.%s ( + CreateTaskMetaTable = `CREATE TABLE IF NOT EXISTS %s ( task_id BIGINT(20) UNSIGNED NOT NULL, pd_cfgs VARCHAR(2048) NOT NULL DEFAULT '', status VARCHAR(32) NOT NULL, @@ -2933,789 +2930,3 @@ func (cr *chunkRestore) restore( return ctx.Err() } } - -type metaMgrBuilder interface { - Init(ctx context.Context) error - TaskMetaMgr(pd *pdutil.PdController) taskMetaMgr - TableMetaMgr(tr *TableRestore) tableMetaMgr -} - -type dbMetaMgrBuilder struct { - db *sql.DB - taskID int64 - schema string -} - -func (b *dbMetaMgrBuilder) Init(ctx context.Context) error { - exec := common.SQLWithRetry{ - DB: b.db, - Logger: log.L(), - HideQueryLog: redact.NeedRedact(), - } - metaDBSQL := fmt.Sprintf("CREATE DATABASE 
IF NOT EXISTS `%s`", b.schema) - if err := exec.Exec(ctx, "create meta schema", metaDBSQL); err != nil { - return errors.Annotate(err, "create meta schema failed") - } - taskMetaSQL := fmt.Sprintf(CreateTaskMetaTable, b.schema, taskMetaTableName) - if err := exec.Exec(ctx, "create meta table", taskMetaSQL); err != nil { - return errors.Annotate(err, "create task meta table failed") - } - tableMetaSQL := fmt.Sprintf(CreateTableMetadataTable, b.schema, tableMetaTableName) - if err := exec.Exec(ctx, "create meta table", tableMetaSQL); err != nil { - return errors.Annotate(err, "create table meta table failed") - } - return nil -} - -func (b *dbMetaMgrBuilder) TaskMetaMgr(pd *pdutil.PdController) taskMetaMgr { - return &dbTaskMetaMgr{ - session: b.db, - taskID: b.taskID, - pd: pd, - tableName: common.UniqueTable(b.schema, taskMetaTableName), - schemaName: b.schema, - } -} - -func (b *dbMetaMgrBuilder) TableMetaMgr(tr *TableRestore) tableMetaMgr { - return &dbTableMetaMgr{ - session: b.db, - taskID: b.taskID, - tr: tr, - tableName: common.UniqueTable(b.schema, tableMetaTableName), - } -} - -type tableMetaMgr interface { - InitTableMeta(ctx context.Context) error - AllocTableRowIDs(ctx context.Context, rawRowIDMax int64) (*verify.KVChecksum, int64, error) - UpdateTableStatus(ctx context.Context, status metaStatus) error - UpdateTableBaseChecksum(ctx context.Context, checksum *verify.KVChecksum) error - CheckAndUpdateLocalChecksum(ctx context.Context, checksum *verify.KVChecksum) (bool, *verify.KVChecksum, error) - FinishTable(ctx context.Context) error -} - -type dbTableMetaMgr struct { - session *sql.DB - taskID int64 - tr *TableRestore - tableName string -} - -func (m *dbTableMetaMgr) InitTableMeta(ctx context.Context) error { - exec := &common.SQLWithRetry{ - DB: m.session, - Logger: m.tr.logger, - } - // avoid override existing metadata if the meta is already inserted. 
- stmt := fmt.Sprintf(`INSERT IGNORE INTO %s (task_id, table_id, table_name, status) values (?, ?, ?, ?)`, m.tableName) - task := m.tr.logger.Begin(zap.DebugLevel, "init table meta") - err := exec.Exec(ctx, "init table meta", stmt, m.taskID, m.tr.tableInfo.ID, m.tr.tableName, metaStatusInitial.String()) - task.End(zap.ErrorLevel, err) - return errors.Trace(err) -} - -type metaStatus uint32 - -const ( - metaStatusInitial metaStatus = iota - metaStatusRowIDAllocated - metaStatusRestoreStarted - metaStatusRestoreFinished - metaStatusChecksuming - metaStatusChecksumSkipped - metaStatusFinished -) - -func (m metaStatus) String() string { - switch m { - case metaStatusInitial: - return "initialized" - case metaStatusRowIDAllocated: - return "allocated" - case metaStatusRestoreStarted: - return "restore" - case metaStatusRestoreFinished: - return "restore_finished" - case metaStatusChecksuming: - return "checksuming" - case metaStatusChecksumSkipped: - return "checksum_skipped" - case metaStatusFinished: - return "finish" - default: - panic(fmt.Sprintf("unexpected metaStatus value '%d'", m)) - } -} - -func parseMetaStatus(s string) (metaStatus, error) { - switch s { - case "", "initialized": - return metaStatusInitial, nil - case "allocated": - return metaStatusRowIDAllocated, nil - case "restore": - return metaStatusRestoreStarted, nil - case "restore_finished": - return metaStatusRestoreFinished, nil - case "checksuming": - return metaStatusChecksuming, nil - case "checksum_skipped": - return metaStatusChecksumSkipped, nil - case "finish": - return metaStatusFinished, nil - default: - return metaStatusInitial, errors.Errorf("invalid meta status '%s'", s) - } -} - -func (m *dbTableMetaMgr) AllocTableRowIDs(ctx context.Context, rawRowIDMax int64) (*verify.KVChecksum, int64, error) { - conn, err := m.session.Conn(ctx) - if err != nil { - return nil, 0, errors.Trace(err) - } - defer conn.Close() - exec := &common.SQLWithRetry{ - DB: m.session, - Logger: m.tr.logger, - } - var newRowIDBase, newRowIDMax int64 - curStatus := metaStatusInitial - newStatus := metaStatusRowIDAllocated - var baseTotalKvs, baseTotalBytes, baseChecksum uint64 - err = exec.Exec(ctx, "enable pessimistic transaction", "SET SESSION tidb_txn_mode = 'pessimistic';") - if err != nil { - return nil, 0, errors.Annotate(err, "enable pessimistic transaction failed") - } - needAutoID := common.TableHasAutoRowID(m.tr.tableInfo.Core) || m.tr.tableInfo.Core.GetAutoIncrementColInfo() != nil || m.tr.tableInfo.Core.ContainsAutoRandomBits() - err = exec.Transact(ctx, "init table allocator base", func(ctx context.Context, tx *sql.Tx) error { - query := fmt.Sprintf("SELECT task_id, row_id_base, row_id_max, total_kvs_base, total_bytes_base, checksum_base, status from %s WHERE table_id = ? 
FOR UPDATE", m.tableName) - rows, err := tx.QueryContext(ctx, query, m.tr.tableInfo.ID) - if err != nil { - return errors.Trace(err) - } - defer rows.Close() - var ( - metaTaskID, rowIDBase, rowIDMax, maxRowIDMax int64 - totalKvs, totalBytes, checksum uint64 - statusValue string - ) - for rows.Next() { - if err = rows.Scan(&metaTaskID, &rowIDBase, &rowIDMax, &totalKvs, &totalBytes, &checksum, &statusValue); err != nil { - return errors.Trace(err) - } - status, err := parseMetaStatus(statusValue) - if err != nil { - return errors.Annotatef(err, "invalid meta status '%s'", statusValue) - } - - // skip finished meta - if status >= metaStatusFinished { - continue - } - - if status == metaStatusChecksuming { - return errors.New("target table is calculating checksum, please wait unit the checksum is finished and try again.") - } - - if metaTaskID == m.taskID { - curStatus = status - baseChecksum = checksum - baseTotalKvs = totalKvs - baseTotalBytes = totalBytes - if status >= metaStatusRowIDAllocated { - if rowIDMax-rowIDBase != rawRowIDMax { - return errors.Errorf("verify allocator base failed. local: '%d', meta: '%d'", rawRowIDMax, rowIDMax-rowIDBase) - } - newRowIDBase = rowIDBase - newRowIDMax = rowIDMax - break - } - continue - } - - // other tasks has finished this logic, we needn't do again. - if status >= metaStatusRowIDAllocated { - newStatus = metaStatusRestoreStarted - } - - if rowIDMax > maxRowIDMax { - maxRowIDMax = rowIDMax - } - } - - // no enough info are available, fetch row_id max for table - if curStatus == metaStatusInitial { - if needAutoID && maxRowIDMax == 0 { - // NOTE: currently, if a table contains auto_incremental unique key and _tidb_rowid, - // the `show table next_row_id` will returns the unique key field only. - var autoIDField string - for _, col := range m.tr.tableInfo.Core.Columns { - if mysql.HasAutoIncrementFlag(col.Flag) { - autoIDField = col.Name.L - break - } else if mysql.HasPriKeyFlag(col.Flag) && m.tr.tableInfo.Core.AutoRandomBits > 0 { - autoIDField = col.Name.L - break - } - } - if len(autoIDField) == 0 && common.TableHasAutoRowID(m.tr.tableInfo.Core) { - autoIDField = model.ExtraHandleName.L - } - if len(autoIDField) == 0 { - return errors.Errorf("table %s contains auto increment id or _tidb_rowid, but target field not found", m.tr.tableName) - } - - autoIDInfos, err := tidb.FetchTableAutoIDInfos(ctx, tx, m.tr.tableName) - if err != nil { - return errors.Trace(err) - } - found := false - for _, info := range autoIDInfos { - if strings.ToLower(info.Column) == autoIDField { - maxRowIDMax = info.NextID - 1 - found = true - break - } - } - if !found { - return errors.Errorf("can't fetch previous auto id base for table %s field '%s'", m.tr.tableName, autoIDField) - } - } - newRowIDBase = maxRowIDMax - newRowIDMax = newRowIDBase + rawRowIDMax - // table contains no data, can skip checksum - if needAutoID && newRowIDBase == 0 && newStatus < metaStatusRestoreStarted { - newStatus = metaStatusRestoreStarted - } - query = fmt.Sprintf("update %s set row_id_base = ?, row_id_max = ?, status = ? where table_id = ? and task_id = ?", m.tableName) - _, err := tx.ExecContext(ctx, query, newRowIDBase, newRowIDMax, newStatus.String(), m.tr.tableInfo.ID, m.taskID) - if err != nil { - return errors.Trace(err) - } - - curStatus = newStatus - } - return nil - }) - if err != nil { - return nil, 0, errors.Trace(err) - } - - var checksum *verify.KVChecksum - // need to do checksum and update checksum meta since we are the first one. 
- if curStatus < metaStatusRestoreStarted { - // table contains data but haven't do checksum yet - if (newRowIDBase > 0 || !needAutoID) && baseTotalKvs == 0 { - remoteCk, err := DoChecksum(ctx, m.tr.tableInfo) - if err != nil { - return nil, 0, errors.Trace(err) - } - - if remoteCk.Checksum != baseChecksum || remoteCk.TotalKVs != baseTotalKvs || remoteCk.TotalBytes != baseTotalBytes { - ck := verify.MakeKVChecksum(remoteCk.TotalBytes, remoteCk.TotalKVs, remoteCk.Checksum) - checksum = &ck - } - - } - - if checksum != nil { - if err = m.UpdateTableBaseChecksum(ctx, checksum); err != nil { - return nil, 0, errors.Trace(err) - } - - m.tr.logger.Info("checksum before restore table", zap.Object("checksum", checksum)) - } else if err = m.UpdateTableStatus(ctx, metaStatusRestoreStarted); err != nil { - return nil, 0, errors.Trace(err) - } - } - if checksum == nil && baseTotalKvs > 0 { - ck := verify.MakeKVChecksum(baseTotalBytes, baseTotalKvs, baseChecksum) - checksum = &ck - } - log.L().Info("allocate table row_id base", zap.String("table", m.tr.tableName), - zap.Int64("row_id_base", newRowIDBase)) - if checksum != nil { - log.L().Info("checksum base", zap.Any("checksum", checksum)) - } - return checksum, newRowIDBase, nil -} - -func (m *dbTableMetaMgr) UpdateTableBaseChecksum(ctx context.Context, checksum *verify.KVChecksum) error { - exec := &common.SQLWithRetry{ - DB: m.session, - Logger: m.tr.logger, - } - query := fmt.Sprintf("update %s set total_kvs_base = ?, total_bytes_base = ?, checksum_base = ?, status = ? where table_id = ? and task_id = ?", m.tableName) - - return exec.Exec(ctx, "update base checksum", query, checksum.SumKVS(), - checksum.SumSize(), checksum.Sum(), metaStatusRestoreStarted.String(), m.tr.tableInfo.ID, m.taskID) -} - -func (m *dbTableMetaMgr) UpdateTableStatus(ctx context.Context, status metaStatus) error { - exec := &common.SQLWithRetry{ - DB: m.session, - Logger: m.tr.logger, - } - query := fmt.Sprintf("update %s set status = ? where table_id = ? and task_id = ?", m.tableName) - return exec.Exec(ctx, "update meta status", query, status.String(), m.tr.tableInfo.ID, m.taskID) -} - -func (m *dbTableMetaMgr) CheckAndUpdateLocalChecksum(ctx context.Context, checksum *verify.KVChecksum) (bool, *verify.KVChecksum, error) { - conn, err := m.session.Conn(ctx) - if err != nil { - return false, nil, errors.Trace(err) - } - defer conn.Close() - exec := &common.SQLWithRetry{ - DB: m.session, - Logger: m.tr.logger, - } - err = exec.Exec(ctx, "enable pessimistic transaction", "SET SESSION tidb_txn_mode = 'pessimistic';") - if err != nil { - return false, nil, errors.Annotate(err, "enable pessimistic transaction failed") - } - var ( - baseTotalKvs, baseTotalBytes, baseChecksum uint64 - taskKvs, taskBytes, taskChecksum uint64 - totalKvs, totalBytes, totalChecksum uint64 - ) - newStatus := metaStatusChecksuming - needChecksum := true - err = exec.Transact(ctx, "checksum pre-check", func(ctx context.Context, tx *sql.Tx) error { - query := fmt.Sprintf("SELECT task_id, total_kvs_base, total_bytes_base, checksum_base, total_kvs, total_bytes, checksum, status from %s WHERE table_id = ? 
FOR UPDATE", m.tableName) - rows, err := tx.QueryContext(ctx, query, m.tr.tableInfo.ID) - if err != nil { - return errors.Annotate(err, "fetch task meta failed") - } - closed := false - defer func() { - if !closed { - rows.Close() - } - }() - var ( - taskID int64 - statusValue string - ) - for rows.Next() { - if err = rows.Scan(&taskID, &baseTotalKvs, &baseTotalBytes, &baseChecksum, &taskKvs, &taskBytes, &taskChecksum, &statusValue); err != nil { - return errors.Trace(err) - } - status, err := parseMetaStatus(statusValue) - if err != nil { - return errors.Annotatef(err, "invalid meta status '%s'", statusValue) - } - - // skip finished meta - if status >= metaStatusFinished { - continue - } - - if taskID == m.taskID { - if status >= metaStatusChecksuming { - newStatus = status - needChecksum = status == metaStatusChecksuming - return nil - } - - continue - } - - if status < metaStatusChecksuming { - newStatus = metaStatusChecksumSkipped - needChecksum = false - break - } else if status == metaStatusChecksuming { - return errors.New("another task is checksuming, there must be something wrong!") - } - - totalBytes += baseTotalBytes - totalKvs += baseTotalKvs - totalChecksum ^= baseChecksum - - totalBytes += taskBytes - totalKvs += taskKvs - totalChecksum ^= taskChecksum - } - rows.Close() - closed = true - - query = fmt.Sprintf("update %s set total_kvs = ?, total_bytes = ?, checksum = ?, status = ? where table_id = ? and task_id = ?", m.tableName) - _, err = tx.ExecContext(ctx, query, checksum.SumKVS(), checksum.SumSize(), checksum.Sum(), newStatus.String(), m.tr.tableInfo.ID, m.taskID) - return errors.Annotate(err, "update local checksum failed") - }) - if err != nil { - return false, nil, err - } - - var remoteChecksum *verify.KVChecksum - if needChecksum { - ck := verify.MakeKVChecksum(totalBytes, totalKvs, totalChecksum) - remoteChecksum = &ck - } - log.L().Info("check table checksum", zap.String("table", m.tr.tableName), - zap.Bool("checksum", needChecksum), zap.String("new_status", newStatus.String())) - return needChecksum, remoteChecksum, nil -} - -func (m *dbTableMetaMgr) FinishTable(ctx context.Context) error { - exec := &common.SQLWithRetry{ - DB: m.session, - Logger: m.tr.logger, - } - query := fmt.Sprintf("DELETE FROM %s where table_id = ? 
and (status = 'checksuming' or status = 'checksum_skipped')", m.tableName) - return exec.Exec(ctx, "clean up metas", query, m.tr.tableInfo.ID) -} - -type taskMetaMgr interface { - InitTask(ctx context.Context) error - CheckAndPausePdSchedulers(ctx context.Context) (pdutil.UndoFunc, error) - CheckAndFinishRestore(ctx context.Context) (bool, error) - Cleanup(ctx context.Context) error - CleanupAllMetas(ctx context.Context) error -} - -type dbTaskMetaMgr struct { - session *sql.DB - taskID int64 - pd *pdutil.PdController - // unique name of task meta table - tableName string - schemaName string -} - -type taskMetaStatus uint32 - -const ( - taskMetaStatusInitial taskMetaStatus = iota - taskMetaStatusScheduleSet - taskMetaStatusSwitchSkipped - taskMetaStatusSwitchBack -) - -func (m taskMetaStatus) String() string { - switch m { - case taskMetaStatusInitial: - return "initialized" - case taskMetaStatusScheduleSet: - return "schedule_set" - case taskMetaStatusSwitchSkipped: - return "skip_switch" - case taskMetaStatusSwitchBack: - return "switched" - default: - panic(fmt.Sprintf("unexpected metaStatus value '%d'", m)) - } -} - -func parseTaskMetaStatus(s string) (taskMetaStatus, error) { - switch s { - case "", "initialized": - return taskMetaStatusInitial, nil - case "schedule_set": - return taskMetaStatusScheduleSet, nil - case "skip_switch": - return taskMetaStatusSwitchSkipped, nil - case "switched": - return taskMetaStatusSwitchBack, nil - default: - return taskMetaStatusInitial, errors.Errorf("invalid meta status '%s'", s) - } -} - -type storedCfgs struct { - PauseCfg pdutil.ClusterConfig `json:"paused"` - RestoreCfg pdutil.ClusterConfig `json:"restore"` -} - -func (m *dbTaskMetaMgr) InitTask(ctx context.Context) error { - exec := &common.SQLWithRetry{ - DB: m.session, - Logger: log.L(), - } - // avoid override existing metadata if the meta is already inserted. 
- stmt := fmt.Sprintf(`INSERT IGNORE INTO %s (task_id, status) values (?, ?)`, m.tableName) - err := exec.Exec(ctx, "init task meta", stmt, m.taskID, taskMetaStatusInitial.String()) - return errors.Trace(err) -} - -func (m *dbTaskMetaMgr) CheckAndPausePdSchedulers(ctx context.Context) (pdutil.UndoFunc, error) { - pauseCtx, cancel := context.WithCancel(ctx) - conn, err := m.session.Conn(ctx) - if err != nil { - return nil, errors.Trace(err) - } - defer conn.Close() - exec := &common.SQLWithRetry{ - DB: m.session, - Logger: log.L(), - } - err = exec.Exec(ctx, "enable pessimistic transaction", "SET SESSION tidb_txn_mode = 'pessimistic';") - if err != nil { - return nil, errors.Annotate(err, "enable pessimistic transaction failed") - } - - needSwitch := true - paused := false - var pausedCfg storedCfgs - err = exec.Transact(ctx, "check and pause schedulers", func(ctx context.Context, tx *sql.Tx) error { - query := fmt.Sprintf("SELECT task_id, pd_cfgs, status from %s FOR UPDATE", m.tableName) - rows, err := tx.QueryContext(ctx, query) - if err != nil { - return errors.Annotate(err, "fetch task meta failed") - } - closed := false - defer func() { - if !closed { - rows.Close() - } - }() - var ( - taskID int64 - cfg string - statusValue string - ) - var cfgStr string - for rows.Next() { - if err = rows.Scan(&taskID, &cfg, &statusValue); err != nil { - return errors.Trace(err) - } - status, err := parseTaskMetaStatus(statusValue) - if err != nil { - return errors.Annotatef(err, "invalid task meta status '%s'", statusValue) - } - - if status == taskMetaStatusInitial { - continue - } - - if taskID == m.taskID { - if status >= taskMetaStatusSwitchSkipped { - needSwitch = false - return nil - } - } - - if cfg != "" { - cfgStr = cfg - break - } - } - if err = rows.Close(); err != nil { - return errors.Trace(err) - } - closed = true - - if cfgStr != "" { - err = json.Unmarshal([]byte(cfgStr), &pausedCfg) - return errors.Trace(err) - } - - orig, removed, err := m.pd.RemoveSchedulersWithOrigin(pauseCtx) - if err != nil { - return errors.Trace(err) - } - paused = true - - pausedCfg = storedCfgs{PauseCfg: removed, RestoreCfg: orig} - jsonByts, err := json.Marshal(&pausedCfg) - if err != nil { - return errors.Trace(err) - } - - query = fmt.Sprintf("update %s set pd_cfgs = ?, status = ? 
where task_id = ?", m.tableName) - _, err = tx.ExecContext(ctx, query, string(jsonByts), taskMetaStatusScheduleSet.String(), m.taskID) - - return errors.Annotate(err, "update task pd configs failed") - }) - if err != nil { - return nil, err - } - - if !needSwitch { - return nil, nil - } - - if !paused { - if err = m.pd.RemoveSchedulersWithCfg(pauseCtx, pausedCfg.PauseCfg); err != nil { - return nil, err - } - } - - cancelFunc := m.pd.MakeUndoFunctionByConfig(pausedCfg.RestoreCfg) - - return func(ctx context.Context) error { - // close the periodic task ctx - cancel() - return cancelFunc(ctx) - }, nil -} - -func (m *dbTaskMetaMgr) CheckAndFinishRestore(ctx context.Context) (bool, error) { - conn, err := m.session.Conn(ctx) - if err != nil { - return false, errors.Trace(err) - } - defer conn.Close() - exec := &common.SQLWithRetry{ - DB: m.session, - Logger: log.L(), - } - err = exec.Exec(ctx, "enable pessimistic transaction", "SET SESSION tidb_txn_mode = 'pessimistic';") - if err != nil { - return false, errors.Annotate(err, "enable pessimistic transaction failed") - } - - switchBack := true - err = exec.Transact(ctx, "check and finish schedulers", func(ctx context.Context, tx *sql.Tx) error { - query := fmt.Sprintf("SELECT task_id, status from %s FOR UPDATE", m.tableName) - rows, err := tx.QueryContext(ctx, query) - if err != nil { - return errors.Annotate(err, "fetch task meta failed") - } - closed := false - defer func() { - if !closed { - rows.Close() - } - }() - var ( - taskID int64 - statusValue string - ) - newStatus := taskMetaStatusSwitchBack - for rows.Next() { - if err = rows.Scan(&taskID, &statusValue); err != nil { - return errors.Trace(err) - } - status, err := parseTaskMetaStatus(statusValue) - if err != nil { - return errors.Annotatef(err, "invalid task meta status '%s'", statusValue) - } - - if taskID == m.taskID { - continue - } - - if status < taskMetaStatusSwitchSkipped { - newStatus = taskMetaStatusSwitchSkipped - switchBack = false - break - } - } - if err = rows.Close(); err != nil { - return errors.Trace(err) - } - closed = true - - query = fmt.Sprintf("update %s set status = ? where task_id = ?", m.tableName) - _, err = tx.ExecContext(ctx, query, newStatus.String(), m.taskID) - - return errors.Trace(err) - }) - - return switchBack, err -} - -func (m *dbTaskMetaMgr) Cleanup(ctx context.Context) error { - exec := &common.SQLWithRetry{ - DB: m.session, - Logger: log.L(), - } - // avoid override existing metadata if the meta is already inserted. - stmt := fmt.Sprintf("DROP TABLE %s;", m.tableName) - if err := exec.Exec(ctx, "cleanup task meta tables", stmt); err != nil { - return errors.Trace(err) - } - return nil -} - -func (m *dbTaskMetaMgr) CleanupAllMetas(ctx context.Context) error { - exec := &common.SQLWithRetry{ - DB: m.session, - Logger: log.L(), - } - - // check if all tables are finished - query := fmt.Sprintf("SELECT COUNT(*) from %s", common.UniqueTable(m.schemaName, tableMetaTableName)) - var cnt int - if err := exec.QueryRow(ctx, "fetch table meta row count", query, &cnt); err != nil { - return errors.Trace(err) - } - if cnt > 0 { - log.L().Warn("there are unfinished table in table meta table, cleanup skipped.") - return nil - } - - // avoid override existing metadata if the meta is already inserted. 
- stmt := fmt.Sprintf("DROP DATABASE %s;", m.schemaName) - if err := exec.Exec(ctx, "cleanup task meta tables", stmt); err != nil { - return errors.Trace(err) - } - return nil -} - -type noopMetaMgrBuilder struct{} - -func (b noopMetaMgrBuilder) Init(ctx context.Context) error { - return nil -} - -func (b noopMetaMgrBuilder) TaskMetaMgr(pd *pdutil.PdController) taskMetaMgr { - return noopTaskMetaMgr{} -} - -func (b noopMetaMgrBuilder) TableMetaMgr(tr *TableRestore) tableMetaMgr { - return noopTableMetaMgr{} -} - -type noopTaskMetaMgr struct{} - -func (m noopTaskMetaMgr) InitTask(ctx context.Context) error { - return nil -} - -func (m noopTaskMetaMgr) CheckAndPausePdSchedulers(ctx context.Context) (pdutil.UndoFunc, error) { - return func(ctx context.Context) error { - return nil - }, nil -} - -func (m noopTaskMetaMgr) CheckAndFinishRestore(ctx context.Context) (bool, error) { - return false, nil -} - -func (m noopTaskMetaMgr) Cleanup(ctx context.Context) error { - return nil -} - -func (m noopTaskMetaMgr) CleanupAllMetas(ctx context.Context) error { - return nil -} - -type noopTableMetaMgr struct{} - -func (m noopTableMetaMgr) InitTableMeta(ctx context.Context) error { - return nil -} - -func (m noopTableMetaMgr) AllocTableRowIDs(ctx context.Context, rawRowIDMax int64) (*verify.KVChecksum, int64, error) { - return nil, 0, nil -} - -func (m noopTableMetaMgr) UpdateTableStatus(ctx context.Context, status metaStatus) error { - return nil -} - -func (m noopTableMetaMgr) UpdateTableBaseChecksum(ctx context.Context, checksum *verify.KVChecksum) error { - return nil -} - -func (m noopTableMetaMgr) CheckAndUpdateLocalChecksum(ctx context.Context, checksum *verify.KVChecksum) (bool, *verify.KVChecksum, error) { - return false, nil, nil -} - -func (m noopTableMetaMgr) FinishTable(ctx context.Context) error { - return nil -} diff --git a/pkg/lightning/restore/restore_test.go b/pkg/lightning/restore/restore_test.go index 8ec8bfdd7..e4b57a9b1 100644 --- a/pkg/lightning/restore/restore_test.go +++ b/pkg/lightning/restore/restore_test.go @@ -15,8 +15,6 @@ package restore import ( "context" - "database/sql" - "database/sql/driver" "fmt" "io/ioutil" "path/filepath" @@ -25,19 +23,6 @@ import ( "github.com/DATA-DOG/go-sqlmock" "github.com/golang/mock/gomock" "github.com/google/uuid" - . "github.com/pingcap/check" - "github.com/pingcap/errors" - "github.com/pingcap/failpoint" - "github.com/pingcap/kvproto/pkg/import_kvpb" - "github.com/pingcap/parser" - "github.com/pingcap/parser/ast" - "github.com/pingcap/parser/model" - "github.com/pingcap/parser/mysql" - filter "github.com/pingcap/tidb-tools/pkg/table-filter" - "github.com/pingcap/tidb/ddl" - tmock "github.com/pingcap/tidb/util/mock" - "go.uber.org/zap" - "github.com/pingcap/br/pkg/lightning/backend" "github.com/pingcap/br/pkg/lightning/backend/importer" "github.com/pingcap/br/pkg/lightning/backend/kv" @@ -56,6 +41,17 @@ import ( "github.com/pingcap/br/pkg/mock" "github.com/pingcap/br/pkg/storage" "github.com/pingcap/br/pkg/version/build" + . 
"github.com/pingcap/check" + "github.com/pingcap/errors" + "github.com/pingcap/failpoint" + "github.com/pingcap/kvproto/pkg/import_kvpb" + "github.com/pingcap/parser" + "github.com/pingcap/parser/ast" + "github.com/pingcap/parser/model" + "github.com/pingcap/parser/mysql" + filter "github.com/pingcap/tidb-tools/pkg/table-filter" + "github.com/pingcap/tidb/ddl" + tmock "github.com/pingcap/tidb/util/mock" ) var _ = Suite(&restoreSuite{}) @@ -1505,224 +1501,3 @@ func (t *testChecksumMgr) Checksum(ctx context.Context, tableInfo *checkpoints.T t.callCnt++ return &t.checksum, nil } - -var _ = Suite(&metaMgrSuite{}) - -type metaMgrSuite struct { - dbHandle *sql.DB - mockDB sqlmock.Sqlmock - tr *TableRestore - mgr *dbTableMetaMgr - checksumMgr *testChecksumMgr -} - -func (s *metaMgrSuite) SetUpSuite(c *C) { - p := parser.New() - se := tmock.NewContext() - - node, err := p.ParseOneStmt("CREATE TABLE `t1` (`c1` varchar(5) NOT NULL)", "utf8mb4", "utf8mb4_bin") - c.Assert(err, IsNil) - tableInfo, err := ddl.MockTableInfo(se, node.(*ast.CreateTableStmt), int64(1)) - c.Assert(err, IsNil) - tableInfo.State = model.StatePublic - - schema := "test" - tb := "t1" - ti := &checkpoints.TidbTableInfo{ - ID: tableInfo.ID, - DB: schema, - Name: tb, - Core: tableInfo, - } - - tableName := common.UniqueTable(schema, tb) - logger := log.With(zap.String("table", tableName)) - s.tr = &TableRestore{ - tableName: tableName, - tableInfo: ti, - logger: logger, - } -} - -func (s *metaMgrSuite) SetUpTest(c *C) { - db, m, err := sqlmock.New() - c.Assert(err, IsNil) - - s.mgr = &dbTableMetaMgr{ - session: db, - taskID: 1, - tr: s.tr, - tableName: common.UniqueTable("test", tableMetaTableName), - } - s.mockDB = m - s.checksumMgr = &testChecksumMgr{} -} - -func (s *metaMgrSuite) TearDownTest(c *C) { - c.Assert(s.mockDB.ExpectationsWereMet(), IsNil) -} - -func (s *metaMgrSuite) TestAllocTableRowIDsSingleTable(c *C) { - ctx := context.WithValue(context.Background(), &checksumManagerKey, s.checksumMgr) - - rows := [][]driver.Value{ - {int64(1), int64(0), int64(0), uint64(0), uint64(0), uint64(0), "initialized"}, - } - nextID := int64(1) - updateArgs := []driver.Value{int64(0), int64(10), "restore", int64(1), int64(1)} - s.prepareMock(rows, &nextID, updateArgs, nil, nil) - - ck, rowIDBase, err := s.mgr.AllocTableRowIDs(ctx, 10) - c.Assert(err, IsNil) - c.Assert(rowIDBase, Equals, int64(0)) - c.Assert(ck, IsNil) - c.Assert(s.checksumMgr.callCnt, Equals, 0) -} - -func (s *metaMgrSuite) TestAllocTableRowIDsSingleTableAutoIDNot0(c *C) { - ctx := context.WithValue(context.Background(), &checksumManagerKey, s.checksumMgr) - - rows := [][]driver.Value{ - {int64(1), int64(0), int64(0), uint64(0), uint64(0), uint64(0), "initialized"}, - } - nextID := int64(999) - updateArgs := []driver.Value{int64(998), int64(1008), "allocated", int64(1), int64(1)} - newStatus := "restore" - s.prepareMock(rows, &nextID, updateArgs, nil, &newStatus) - - ck, rowIDBase, err := s.mgr.AllocTableRowIDs(ctx, 10) - c.Assert(err, IsNil) - c.Assert(rowIDBase, Equals, int64(998)) - c.Assert(ck, IsNil) - c.Assert(s.checksumMgr.callCnt, Equals, 1) -} - -func (s *metaMgrSuite) TestAllocTableRowIDsSingleTableContainsData(c *C) { - ctx := context.WithValue(context.Background(), &checksumManagerKey, s.checksumMgr) - - rows := [][]driver.Value{ - {int64(1), int64(0), int64(0), uint64(0), uint64(0), uint64(0), "initialized"}, - } - nextID := int64(999) - checksum := verification.MakeKVChecksum(1, 2, 3) - updateArgs := []driver.Value{int64(998), int64(1008), "allocated", 
int64(1), int64(1)} - s.prepareMock(rows, &nextID, updateArgs, &checksum, nil) - - ck, rowIDBase, err := s.mgr.AllocTableRowIDs(ctx, 10) - c.Assert(err, IsNil) - c.Assert(rowIDBase, Equals, int64(998)) - c.Assert(ck, DeepEquals, &checksum) - c.Assert(s.checksumMgr.callCnt, Equals, 1) -} - -func (s *metaMgrSuite) TestAllocTableRowIDsAllocated(c *C) { - ctx := context.WithValue(context.Background(), &checksumManagerKey, s.checksumMgr) - - rows := [][]driver.Value{ - {int64(1), int64(998), int64(1008), uint64(0), uint64(0), uint64(0), metaStatusRowIDAllocated.String()}, - } - checksum := verification.MakeKVChecksum(2, 1, 3) - s.prepareMock(rows, nil, nil, &checksum, nil) - - ck, rowIDBase, err := s.mgr.AllocTableRowIDs(ctx, 10) - c.Assert(err, IsNil) - c.Assert(rowIDBase, Equals, int64(998)) - c.Assert(ck, DeepEquals, &checksum) - c.Assert(s.checksumMgr.callCnt, Equals, 1) -} - -func (s *metaMgrSuite) TestAllocTableRowIDsFinished(c *C) { - ctx := context.WithValue(context.Background(), &checksumManagerKey, s.checksumMgr) - - rows := [][]driver.Value{ - {int64(1), int64(998), int64(1008), uint64(1), uint64(2), uint64(3), metaStatusRestoreStarted.String()}, - } - checksum := verification.MakeKVChecksum(2, 1, 3) - s.prepareMock(rows, nil, nil, nil, nil) - - ck, rowIDBase, err := s.mgr.AllocTableRowIDs(ctx, 10) - c.Assert(err, IsNil) - c.Assert(rowIDBase, Equals, int64(998)) - c.Assert(ck, DeepEquals, &checksum) - c.Assert(s.checksumMgr.callCnt, Equals, 0) -} - -func (s *metaMgrSuite) TestAllocTableRowIDsMultiTasksInit(c *C) { - ctx := context.WithValue(context.Background(), &checksumManagerKey, s.checksumMgr) - - rows := [][]driver.Value{ - {int64(1), int64(0), int64(0), uint64(0), uint64(0), uint64(0), "initialized"}, - {int64(2), int64(0), int64(0), uint64(0), uint64(0), uint64(0), "initialized"}, - } - nextID := int64(1) - updateArgs := []driver.Value{int64(0), int64(10), "restore", int64(1), int64(1)} - s.prepareMock(rows, &nextID, updateArgs, nil, nil) - - ck, rowIDBase, err := s.mgr.AllocTableRowIDs(ctx, 10) - c.Assert(err, IsNil) - c.Assert(rowIDBase, Equals, int64(0)) - c.Assert(ck, IsNil) - c.Assert(s.checksumMgr.callCnt, Equals, 0) -} - -func (s *metaMgrSuite) TestAllocTableRowIDsMultiTasksAllocated(c *C) { - ctx := context.WithValue(context.Background(), &checksumManagerKey, s.checksumMgr) - - rows := [][]driver.Value{ - {int64(1), int64(0), int64(0), uint64(0), uint64(0), uint64(0), metaStatusInitial.String()}, - {int64(2), int64(0), int64(100), uint64(0), uint64(0), uint64(0), metaStatusRowIDAllocated.String()}, - } - updateArgs := []driver.Value{int64(100), int64(110), "restore", int64(1), int64(1)} - s.prepareMock(rows, nil, updateArgs, nil, nil) - - ck, rowIDBase, err := s.mgr.AllocTableRowIDs(ctx, 10) - c.Assert(err, IsNil) - c.Assert(rowIDBase, Equals, int64(100)) - c.Assert(ck, IsNil) - c.Assert(s.checksumMgr.callCnt, Equals, 0) -} - -func (s *metaMgrSuite) prepareMock(rowsVal [][]driver.Value, nextRowID *int64, updateArgs []driver.Value, checksum *verification.KVChecksum, updateStatus *string) { - s.mockDB.ExpectExec("SET SESSION tidb_txn_mode = 'pessimistic';"). - WillReturnResult(sqlmock.NewResult(int64(0), int64(0))) - - s.mockDB.ExpectBegin() - - rows := sqlmock.NewRows([]string{"task_id", "row_id_base", "row_id_max", "total_kvs_base", "total_bytes_base", "checksum_base", "status"}) - for _, r := range rowsVal { - rows = rows.AddRow(r...) 
- } - s.mockDB.ExpectQuery("\\QSELECT task_id, row_id_base, row_id_max, total_kvs_base, total_bytes_base, checksum_base, status from `test`.`table_meta` WHERE table_id = ? FOR UPDATE\\E"). - WithArgs(int64(1)). - WillReturnRows(rows) - if nextRowID != nil { - s.mockDB.ExpectQuery("SHOW TABLE `test`.`t1` NEXT_ROW_ID"). - WillReturnRows(sqlmock.NewRows([]string{"DB_NAME", "TABLE_NAME", "COLUMN_NAME", "NEXT_GLOBAL_ROW_ID", "ID_TYPE"}). - AddRow("test", "t1", "_tidb_rowid", *nextRowID, "AUTO_INCREMENT")) - } - - if len(updateArgs) > 0 { - s.mockDB.ExpectExec("\\Qupdate `test`.`table_meta` set row_id_base = ?, row_id_max = ?, status = ? where table_id = ? and task_id = ?\\E"). - WithArgs(updateArgs...). - WillReturnResult(sqlmock.NewResult(int64(0), int64(1))) - } - - s.mockDB.ExpectCommit() - - if checksum != nil { - s.mockDB.ExpectExec("\\Qupdate `test`.`table_meta` set total_kvs_base = ?, total_bytes_base = ?, checksum_base = ?, status = ? where table_id = ? and task_id = ?\\E"). - WithArgs(checksum.SumKVS(), checksum.SumSize(), checksum.Sum(), metaStatusRestoreStarted.String(), int64(1), int64(1)). - WillReturnResult(sqlmock.NewResult(int64(0), int64(1))) - s.checksumMgr.checksum = RemoteChecksum{ - TotalBytes: checksum.SumSize(), - TotalKVs: checksum.SumKVS(), - Checksum: checksum.Sum(), - } - } - - if updateStatus != nil { - s.mockDB.ExpectExec("\\Qupdate `test`.`table_meta` set status = ? where table_id = ? and task_id = ?\\E"). - WithArgs(*updateStatus, int64(1), int64(1)). - WillReturnResult(sqlmock.NewResult(int64(0), int64(1))) - } -} From ef782e18651806c0df42b9003a496c995db726c1 Mon Sep 17 00:00:00 2001 From: glorv Date: Thu, 3 Jun 2021 19:02:50 +0800 Subject: [PATCH 30/32] remove useless code --- tests/lightning_incremental/run.sh | 95 ++++++++++++++---------------- 1 file changed, 45 insertions(+), 50 deletions(-) diff --git a/tests/lightning_incremental/run.sh b/tests/lightning_incremental/run.sh index ed36c76d8..bf8ccde57 100644 --- a/tests/lightning_incremental/run.sh +++ b/tests/lightning_incremental/run.sh @@ -15,67 +15,62 @@ set -eu -# FIXME: auto-random is only stable on master currently. 
-check_cluster_version 4 0 0 AUTO_RANDOM || exit 0 +check_cluster_version 4 0 0 "incremental restore" || exit 0 DB_NAME=incr for backend in importer local; do - if [ "$backend" = 'local' ]; then - check_cluster_version 4 0 0 'local backend' || continue - fi - - run_sql "DROP DATABASE IF EXISTS incr;" - run_lightning --backend $backend + run_sql "DROP DATABASE IF EXISTS incr;" + run_lightning --backend $backend - for tbl in auto_random pk_auto_inc rowid_uk_inc uk_auto_inc; do - run_sql "SELECT count(*) from incr.$tbl" - check_contains "count(*): 3" - done + for tbl in auto_random pk_auto_inc rowid_uk_inc uk_auto_inc; do + run_sql "SELECT count(*) from incr.$tbl" + check_contains "count(*): 3" + done - for tbl in auto_random pk_auto_inc rowid_uk_inc uk_auto_inc; do - if [ "$tbl" = "auto_random" ]; then - run_sql "SELECT id & b'000001111111111111111111111111111111111111111111111111111111111' as inc FROM incr.$tbl" - else - run_sql "SELECT id as inc FROM incr.$tbl" - fi - check_contains 'inc: 1' - check_contains 'inc: 2' - check_contains 'inc: 3' - done + for tbl in auto_random pk_auto_inc rowid_uk_inc uk_auto_inc; do + if [ "$tbl" = "auto_random" ]; then + run_sql "SELECT id & b'000001111111111111111111111111111111111111111111111111111111111' as inc FROM incr.$tbl" + else + run_sql "SELECT id as inc FROM incr.$tbl" + fi + check_contains 'inc: 1' + check_contains 'inc: 2' + check_contains 'inc: 3' + done - for tbl in pk_auto_inc rowid_uk_inc; do - run_sql "SELECT group_concat(v) from incr.$tbl group by 'all';" - check_contains "group_concat(v): a,b,c" - done + for tbl in pk_auto_inc rowid_uk_inc; do + run_sql "SELECT group_concat(v) from incr.$tbl group by 'all';" + check_contains "group_concat(v): a,b,c" + done - run_sql "SELECT sum(pk) from incr.uk_auto_inc;" - check_contains "sum(pk): 6" + run_sql "SELECT sum(pk) from incr.uk_auto_inc;" + check_contains "sum(pk): 6" - # incrementally import all data in data1 - run_lightning --backend $backend -d "tests/$TEST_NAME/data1" + # incrementally import all data in data1 + run_lightning --backend $backend -d "tests/$TEST_NAME/data1" - for tbl in auto_random pk_auto_inc rowid_uk_inc uk_auto_inc; do - run_sql "SELECT count(*) from incr.$tbl" - check_contains "count(*): 6" - done + for tbl in auto_random pk_auto_inc rowid_uk_inc uk_auto_inc; do + run_sql "SELECT count(*) from incr.$tbl" + check_contains "count(*): 6" + done - for tbl in auto_random pk_auto_inc rowid_uk_inc uk_auto_inc; do - if [ "$tbl" = "auto_random" ]; then - run_sql "SELECT id & b'000001111111111111111111111111111111111111111111111111111111111' as inc FROM incr.$tbl" - else - run_sql "SELECT id as inc FROM incr.$tbl" - fi - check_contains 'inc: 4' - check_contains 'inc: 5' - check_contains 'inc: 6' - done + for tbl in auto_random pk_auto_inc rowid_uk_inc uk_auto_inc; do + if [ "$tbl" = "auto_random" ]; then + run_sql "SELECT id & b'000001111111111111111111111111111111111111111111111111111111111' as inc FROM incr.$tbl" + else + run_sql "SELECT id as inc FROM incr.$tbl" + fi + check_contains 'inc: 4' + check_contains 'inc: 5' + check_contains 'inc: 6' + done - for tbl in pk_auto_inc rowid_uk_inc; do - run_sql "SELECT group_concat(v) from incr.$tbl group by 'all';" - check_contains "group_concat(v): a,b,c,d,e,f" - done + for tbl in pk_auto_inc rowid_uk_inc; do + run_sql "SELECT group_concat(v) from incr.$tbl group by 'all';" + check_contains "group_concat(v): a,b,c,d,e,f" + done - run_sql "SELECT sum(pk) from incr.uk_auto_inc;" - check_contains "sum(pk): 21" + run_sql "SELECT sum(pk) 
from incr.uk_auto_inc;" + check_contains "sum(pk): 21" done From a895b15093ab972244837ac7d859a4c6508d3d48 Mon Sep 17 00:00:00 2001 From: glorv Date: Thu, 3 Jun 2021 19:21:29 +0800 Subject: [PATCH 31/32] fmt code --- pkg/lightning/restore/restore_test.go | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/pkg/lightning/restore/restore_test.go b/pkg/lightning/restore/restore_test.go index e4b57a9b1..49e3a1d4a 100644 --- a/pkg/lightning/restore/restore_test.go +++ b/pkg/lightning/restore/restore_test.go @@ -23,6 +23,18 @@ import ( "github.com/DATA-DOG/go-sqlmock" "github.com/golang/mock/gomock" "github.com/google/uuid" + . "github.com/pingcap/check" + "github.com/pingcap/errors" + "github.com/pingcap/failpoint" + "github.com/pingcap/kvproto/pkg/import_kvpb" + "github.com/pingcap/parser" + "github.com/pingcap/parser/ast" + "github.com/pingcap/parser/model" + "github.com/pingcap/parser/mysql" + filter "github.com/pingcap/tidb-tools/pkg/table-filter" + "github.com/pingcap/tidb/ddl" + tmock "github.com/pingcap/tidb/util/mock" + "github.com/pingcap/br/pkg/lightning/backend" "github.com/pingcap/br/pkg/lightning/backend/importer" "github.com/pingcap/br/pkg/lightning/backend/kv" @@ -41,17 +53,6 @@ import ( "github.com/pingcap/br/pkg/mock" "github.com/pingcap/br/pkg/storage" "github.com/pingcap/br/pkg/version/build" - . "github.com/pingcap/check" - "github.com/pingcap/errors" - "github.com/pingcap/failpoint" - "github.com/pingcap/kvproto/pkg/import_kvpb" - "github.com/pingcap/parser" - "github.com/pingcap/parser/ast" - "github.com/pingcap/parser/model" - "github.com/pingcap/parser/mysql" - filter "github.com/pingcap/tidb-tools/pkg/table-filter" - "github.com/pingcap/tidb/ddl" - tmock "github.com/pingcap/tidb/util/mock" ) var _ = Suite(&restoreSuite{}) From a352c3a85bd0594ca4029e688f6c4d723cb43ec1 Mon Sep 17 00:00:00 2001 From: glorv Date: Fri, 4 Jun 2021 11:54:31 +0800 Subject: [PATCH 32/32] add copy right header --- pkg/lightning/restore/meta_manager.go | 2 ++ pkg/lightning/restore/meta_manager_test.go | 2 ++ 2 files changed, 4 insertions(+) diff --git a/pkg/lightning/restore/meta_manager.go b/pkg/lightning/restore/meta_manager.go index 50fe0640c..bbef6fa6e 100644 --- a/pkg/lightning/restore/meta_manager.go +++ b/pkg/lightning/restore/meta_manager.go @@ -1,3 +1,5 @@ +// Copyright 2021 PingCAP, Inc. Licensed under Apache-2.0. + package restore import ( diff --git a/pkg/lightning/restore/meta_manager_test.go b/pkg/lightning/restore/meta_manager_test.go index bfe35106e..bf2fcba38 100644 --- a/pkg/lightning/restore/meta_manager_test.go +++ b/pkg/lightning/restore/meta_manager_test.go @@ -1,3 +1,5 @@ +// Copyright 2021 PingCAP, Inc. Licensed under Apache-2.0. + package restore import (