diff --git a/pkg/backup/schema.go b/pkg/backup/schema.go index c0d3f2e89..91acf20c6 100644 --- a/pkg/backup/schema.go +++ b/pkg/backup/schema.go @@ -86,6 +86,11 @@ func (ss *Schemas) BackupSchemas( startAll := time.Now() for _, s := range ss.schemas { schema := s + // Because schema.dbInfo is a pointer that many tables point to. + // Remove "add Temporary-prefix into dbName" from closure to prevent concurrent operations. + if utils.IsSysDB(schema.dbInfo.Name.L) { + schema.dbInfo.Name = utils.TemporaryDBName(schema.dbInfo.Name.O) + } workerPool.ApplyOnErrorGroup(errg, func() error { logger := log.With( zap.String("db", schema.dbInfo.Name.O), diff --git a/pkg/backup/schema_test.go b/pkg/backup/schema_test.go index c25bd8e53..630e1fa1d 100644 --- a/pkg/backup/schema_test.go +++ b/pkg/backup/schema_test.go @@ -4,7 +4,9 @@ package backup_test import ( "context" + "fmt" "math" + "strings" "sync/atomic" . "github.com/pingcap/check" @@ -15,6 +17,11 @@ import ( "github.com/pingcap/br/pkg/backup" "github.com/pingcap/br/pkg/mock" +<<<<<<< HEAD +======= + "github.com/pingcap/br/pkg/storage" + "github.com/pingcap/br/pkg/utils" +>>>>>>> ec59c7b6 (lightning: cherry-pick some PRs (#1458)) ) var _ = Suite(&testBackupSchemaSuite{}) @@ -184,3 +191,37 @@ func (s *testBackupSchemaSuite) TestBuildBackupRangeAndSchemaWithBrokenStats(c * c.Assert(schemas2[0].Table, DeepEquals, schemas[0].Table) c.Assert(schemas2[0].Db, DeepEquals, schemas[0].Db) } + +func (s *testBackupSchemaSuite) TestBackupSchemasForSystemTable(c *C) { + tk := testkit.NewTestKit(c, s.mock.Storage) + es2 := s.GetRandomStorage(c) + + systemTablesCount := 32 + tablePrefix := "systable" + tk.MustExec("use mysql") + for i := 1; i <= systemTablesCount; i++ { + query := fmt.Sprintf("create table %s%d (a char(1));", tablePrefix, i) + tk.MustExec(query) + } + + f, err := filter.Parse([]string{"mysql.systable*"}) + c.Assert(err, IsNil) + _, backupSchemas, err := backup.BuildBackupRangeAndSchema(s.mock.Storage, f, math.MaxUint64) + c.Assert(err, IsNil) + c.Assert(backupSchemas.Len(), Equals, systemTablesCount) + + ctx := context.Background() + updateCh := new(simpleProgress) + + metaWriter2 := metautil.NewMetaWriter(es2, metautil.MetaFileSize, false) + err = backupSchemas.BackupSchemas(ctx, metaWriter2, s.mock.Storage, nil, + math.MaxUint64, 1, variable.DefChecksumTableConcurrency, true, updateCh) + c.Assert(err, IsNil) + + schemas2 := s.GetSchemasFromMeta(c, es2) + c.Assert(schemas2, HasLen, systemTablesCount) + for _, schema := range schemas2 { + c.Assert(schema.DB.Name, Equals, utils.TemporaryDBName("mysql")) + c.Assert(strings.HasPrefix(schema.Info.Name.O, tablePrefix), Equals, true) + } +} diff --git a/pkg/lightning/mydump/region.go b/pkg/lightning/mydump/region.go index a4402bf8a..a004159f9 100644 --- a/pkg/lightning/mydump/region.go +++ b/pkg/lightning/mydump/region.go @@ -31,7 +31,11 @@ import ( "github.com/pingcap/br/pkg/storage" ) -const tableRegionSizeWarningThreshold int64 = 1024 * 1024 * 1024 +const ( + tableRegionSizeWarningThreshold int64 = 1024 * 1024 * 1024 + // the increment ratio of large CSV file size threshold by `region-split-size` + largeCSVLowerThresholdRation = 10 +) type TableRegion struct { EngineID int32 @@ -260,7 +264,10 @@ func makeSourceFileRegion( } // If a csv file is overlarge, we need to split it into multiple regions. // Note: We can only split a csv file whose format is strict. - if isCsvFile && dataFileSize > int64(cfg.Mydumper.MaxRegionSize) && cfg.Mydumper.StrictFormat { + // We increase the check threshold by 1/10 of the `max-region-size` because the source file size dumped by tools + // like dumpling might be slight exceed the threshold when it is equal `max-region-size`, so we can + // avoid split a lot of small chunks. + if isCsvFile && cfg.Mydumper.StrictFormat && dataFileSize > int64(cfg.Mydumper.MaxRegionSize+cfg.Mydumper.MaxRegionSize/largeCSVLowerThresholdRation) { _, regions, subFileSizes, err := SplitLargeFile(ctx, meta, cfg, fi, divisor, 0, ioWorkers, store) return regions, subFileSizes, err } @@ -351,6 +358,9 @@ func SplitLargeFile( columns = parser.Columns() startOffset, _ = parser.Pos() endOffset = startOffset + maxRegionSize + if endOffset > dataFile.FileMeta.FileSize { + endOffset = dataFile.FileMeta.FileSize + } } for { curRowsCnt := (endOffset - startOffset) / divisor diff --git a/pkg/lightning/mydump/region_test.go b/pkg/lightning/mydump/region_test.go index 7cbb25840..689f40027 100644 --- a/pkg/lightning/mydump/region_test.go +++ b/pkg/lightning/mydump/region_test.go @@ -292,3 +292,111 @@ func (s *testMydumpRegionSuite) TestSplitLargeFileNoNewLine(c *C) { c.Assert(regions[i].Chunk.Columns, DeepEquals, columns) } } +<<<<<<< HEAD +======= + +func (s *testMydumpRegionSuite) TestSplitLargeFileWithCustomTerminator(c *C) { + meta := &MDTableMeta{ + DB: "csv", + Name: "large_csv_with_custom_terminator", + } + cfg := &config.Config{ + Mydumper: config.MydumperRuntime{ + ReadBlockSize: config.ReadBlockSize, + CSV: config.CSVConfig{ + Separator: "|+|", + Terminator: "|+|\n", + }, + StrictFormat: true, + Filter: []string{"*.*"}, + MaxRegionSize: 1, + }, + } + + dir := c.MkDir() + + fileName := "test2.csv" + filePath := filepath.Join(dir, fileName) + + content := []byte("5|+|abc\ndef\nghi|+|6|+|\n7|+|xyz|+|8|+|\n9|+||+|10") + err := os.WriteFile(filePath, content, 0o644) + c.Assert(err, IsNil) + + dataFileInfo, err := os.Stat(filePath) + c.Assert(err, IsNil) + fileSize := dataFileInfo.Size() + fileInfo := FileInfo{FileMeta: SourceFileMeta{Path: fileName, Type: SourceTypeCSV, FileSize: fileSize}} + colCnt := int64(3) + prevRowIdxMax := int64(0) + ioWorker := worker.NewPool(context.Background(), 4, "io") + + store, err := storage.NewLocalStorage(dir) + c.Assert(err, IsNil) + + offsets := [][]int64{{0, 23}, {23, 38}, {38, 47}} + + _, regions, _, err := SplitLargeFile(context.Background(), meta, cfg, fileInfo, colCnt, prevRowIdxMax, ioWorker, store) + c.Assert(err, IsNil) + c.Assert(regions, HasLen, len(offsets)) + for i := range offsets { + c.Assert(regions[i].Chunk.Offset, Equals, offsets[i][0]) + c.Assert(regions[i].Chunk.EndOffset, Equals, offsets[i][1]) + } +} + +func (s *testMydumpRegionSuite) TestSplitLargeFileOnlyOneChunk(c *C) { + meta := &MDTableMeta{ + DB: "csv", + Name: "large_csv_file", + } + cfg := &config.Config{ + Mydumper: config.MydumperRuntime{ + ReadBlockSize: config.ReadBlockSize, + CSV: config.CSVConfig{ + Separator: ",", + Delimiter: "", + Header: true, + TrimLastSep: false, + NotNull: false, + Null: "NULL", + BackslashEscape: true, + }, + StrictFormat: true, + Filter: []string{"*.*"}, + MaxRegionSize: 15, + }, + } + + dir := c.MkDir() + + fileName := "test.csv" + filePath := filepath.Join(dir, fileName) + + content := []byte("field1,field2\r\n123,456\r\n") + err := os.WriteFile(filePath, content, 0o644) + c.Assert(err, IsNil) + + dataFileInfo, err := os.Stat(filePath) + c.Assert(err, IsNil) + fileSize := dataFileInfo.Size() + fileInfo := FileInfo{FileMeta: SourceFileMeta{Path: fileName, Type: SourceTypeCSV, FileSize: fileSize}} + colCnt := int64(2) + columns := []string{"field1", "field2"} + prevRowIdxMax := int64(0) + ioWorker := worker.NewPool(context.Background(), 4, "io") + + store, err := storage.NewLocalStorage(dir) + c.Assert(err, IsNil) + + offsets := [][]int64{{14, 24}} + + _, regions, _, err := SplitLargeFile(context.Background(), meta, cfg, fileInfo, colCnt, prevRowIdxMax, ioWorker, store) + c.Assert(err, IsNil) + c.Assert(regions, HasLen, len(offsets)) + for i := range offsets { + c.Assert(regions[i].Chunk.Offset, Equals, offsets[i][0]) + c.Assert(regions[i].Chunk.EndOffset, Equals, offsets[i][1]) + c.Assert(regions[i].Chunk.Columns, DeepEquals, columns) + } +} +>>>>>>> ec59c7b6 (lightning: cherry-pick some PRs (#1458)) diff --git a/pkg/pdutil/pd.go b/pkg/pdutil/pd.go index 104c6e998..28f8fc6b5 100644 --- a/pkg/pdutil/pd.go +++ b/pkg/pdutil/pd.go @@ -123,9 +123,7 @@ var ( "max-merge-region-size": 20, "leader-schedule-limit": 4, "region-schedule-limit": 2048, - "max-snapshot-count": 3, "enable-location-replacement": "true", - "max-pending-peer-count": 16, } ) diff --git a/pkg/storage/gcs.go b/pkg/storage/gcs.go index b34c8bc9f..8bb5aaca3 100644 --- a/pkg/storage/gcs.go +++ b/pkg/storage/gcs.go @@ -153,8 +153,41 @@ func (s *gcsStorage) Open(ctx context.Context, path string) (ExternalFileReader, // function; the second argument is the size in byte of the file determined // by path. func (s *gcsStorage) WalkDir(ctx context.Context, opt *WalkOption, fn func(string, int64) error) error { +<<<<<<< HEAD // TODO, implement this if needed panic("Unsupported Operation") +======= + if opt == nil { + opt = &WalkOption{} + } + + prefix := path.Join(s.gcs.Prefix, opt.SubDir) + if len(prefix) > 0 && !strings.HasSuffix(prefix, "/") { + prefix += "/" + } + + query := &storage.Query{Prefix: prefix} + // only need each object's name and size + query.SetAttrSelection([]string{"Name", "Size"}) + iter := s.bucket.Objects(ctx, query) + for { + attrs, err := iter.Next() + if err == iterator.Done { + break + } + if err != nil { + return errors.Trace(err) + } + // when walk on specify directory, the result include storage.Prefix, + // which can not be reuse in other API(Open/Read) directly. + // so we use TrimPrefix to filter Prefix for next Open/Read. + path := strings.TrimPrefix(attrs.Name, s.gcs.Prefix) + if err = fn(path, attrs.Size); err != nil { + return errors.Trace(err) + } + } + return nil +>>>>>>> ec59c7b6 (lightning: cherry-pick some PRs (#1458)) } func (s *gcsStorage) URI() string { diff --git a/pkg/storage/gcs_test.go b/pkg/storage/gcs_test.go index c47b6c573..cfe28e836 100644 --- a/pkg/storage/gcs_test.go +++ b/pkg/storage/gcs_test.go @@ -4,7 +4,12 @@ package storage import ( "context" +<<<<<<< HEAD "io/ioutil" +======= + "fmt" + "io" +>>>>>>> ec59c7b6 (lightning: cherry-pick some PRs (#1458)) "os" "github.com/fsouza/fake-gcs-server/fakestorage" @@ -59,6 +64,95 @@ func (r *testStorageSuite) TestGCS(c *C) { c.Assert(err, IsNil) c.Assert(exist, IsFalse) +<<<<<<< HEAD +======= + list := "" + var totalSize int64 = 0 + err = stg.WalkDir(ctx, nil, func(name string, size int64) error { + list += name + totalSize += size + return nil + }) + c.Assert(err, IsNil) + c.Assert(list, Equals, "keykey1key2") + c.Assert(totalSize, Equals, int64(42)) + + // test 1003 files + totalSize = 0 + for i := 0; i < 1000; i += 1 { + err = stg.WriteFile(ctx, fmt.Sprintf("f%d", i), []byte("data")) + c.Assert(err, IsNil) + } + filesSet := make(map[string]struct{}, 1003) + err = stg.WalkDir(ctx, nil, func(name string, size int64) error { + filesSet[name] = struct{}{} + totalSize += size + return nil + }) + c.Assert(err, IsNil) + c.Assert(totalSize, Equals, int64(42+4000)) + _, ok := filesSet["key"] + c.Assert(ok, IsTrue) + _, ok = filesSet["key1"] + c.Assert(ok, IsTrue) + _, ok = filesSet["key2"] + c.Assert(ok, IsTrue) + for i := 0; i < 1000; i += 1 { + _, ok = filesSet[fmt.Sprintf("f%d", i)] + c.Assert(ok, IsTrue) + } + + efr, err := stg.Open(ctx, "key2") + c.Assert(err, IsNil) + + p := make([]byte, 10) + n, err := efr.Read(p) + c.Assert(err, IsNil) + c.Assert(n, Equals, 10) + c.Assert(string(p), Equals, "data222233") + + p = make([]byte, 40) + n, err = efr.Read(p) + c.Assert(err, IsNil) + c.Assert(n, Equals, 23) + c.Assert(string(p[:23]), Equals, "46757222222222289722222") + + p = make([]byte, 5) + offs, err := efr.Seek(3, io.SeekStart) + c.Assert(err, IsNil) + c.Assert(offs, Equals, int64(3)) + + n, err = efr.Read(p) + c.Assert(err, IsNil) + c.Assert(n, Equals, 5) + c.Assert(string(p), Equals, "a2222") + + p = make([]byte, 5) + offs, err = efr.Seek(3, io.SeekCurrent) + c.Assert(err, IsNil) + c.Assert(offs, Equals, int64(11)) + + n, err = efr.Read(p) + c.Assert(err, IsNil) + c.Assert(n, Equals, 5) + c.Assert(string(p), Equals, "67572") + + /* Since fake_gcs_server hasn't support for negative offset yet. + p = make([]byte, 5) + offs, err = efr.Seek(int64(-7), io.SeekEnd) + c.Assert(err, IsNil) + c.Assert(offs, Equals, int64(-7)) + + n, err = efr.Read(p) + c.Assert(err, IsNil) + c.Assert(n, Equals, 5) + c.Assert(string(p), Equals, "97222") + */ + + err = efr.Close() + c.Assert(err, IsNil) + +>>>>>>> ec59c7b6 (lightning: cherry-pick some PRs (#1458)) c.Assert(stg.URI(), Equals, "gcs://testbucket/a/b/") } diff --git a/pkg/storage/s3.go b/pkg/storage/s3.go index 2f1567e35..ca22b6d7b 100644 --- a/pkg/storage/s3.go +++ b/pkg/storage/s3.go @@ -30,6 +30,7 @@ import ( "go.uber.org/zap" berrors "github.com/pingcap/br/pkg/errors" + "github.com/pingcap/br/pkg/logutil" ) const ( @@ -608,6 +609,19 @@ func (r *s3ObjectReader) Seek(offset int64, whence int) (int64, error) { if realOffset == r.pos { return realOffset, nil + } else if realOffset >= r.rangeInfo.Size { + // See: https://www.w3.org/Protocols/rfc2616/rfc2616-sec14.html#sec14.35 + // because s3's GetObject interface doesn't allow get a range that matches zero length data, + // so if the position is out of range, we need to always return io.EOF after the seek operation. + + // close current read and open a new one which target offset + if err := r.reader.Close(); err != nil { + log.L().Warn("close s3 reader failed, will ignore this error", logutil.ShortError(err)) + } + + r.reader = io.NopCloser(bytes.NewReader(nil)) + r.pos = r.rangeInfo.Size + return r.pos, nil } // if seek ahead no more than 64k, we discard these data diff --git a/pkg/storage/s3_test.go b/pkg/storage/s3_test.go index 3971e2ac7..a8192ed0a 100644 --- a/pkg/storage/s3_test.go +++ b/pkg/storage/s3_test.go @@ -697,6 +697,15 @@ func (s *s3Suite) TestOpenSeek(c *C) { c.Assert(err, IsNil) c.Assert(n, Equals, 100) c.Assert(slice, DeepEquals, someRandomBytes[990100:990200]) + + // test seek to the file end or bigger positions + for _, p := range []int64{1000000, 1000001, 2000000} { + offset, err = reader.Seek(p, io.SeekStart) + c.Assert(offset, Equals, int64(1000000)) + c.Assert(err, IsNil) + _, err := reader.Read(slice) + c.Assert(err, Equals, io.EOF) + } } type limitedBytesReader struct { diff --git a/tests/br_log_restore/run.sh b/tests/br_log_restore/run.sh index 2be57ccc7..e424fd409 100755 --- a/tests/br_log_restore/run.sh +++ b/tests/br_log_restore/run.sh @@ -13,6 +13,9 @@ # See the License for the specific language governing permissions and # limitations under the License. +# skip log restore test because ticdc doesn't support this feature anymore. +exit 0 + set -eux DB="$TEST_NAME" TABLE="usertable" diff --git a/tests/br_other/run.sh b/tests/br_other/run.sh index c4d75eaa5..a25077131 100644 --- a/tests/br_other/run.sh +++ b/tests/br_other/run.sh @@ -134,9 +134,7 @@ default_pd_values='{ "max-merge-region-keys": 200000, "max-merge-region-size": 20, "leader-schedule-limit": 4, - "region-schedule-limit": 2048, - "max-snapshot-count": 3, - "max-pending-peer-count": 16 + "region-schedule-limit": 2048 }' for key in $(echo $default_pd_values | jq 'keys[]'); do diff --git a/tests/lightning_duplicate_detection/run.sh b/tests/lightning_duplicate_detection/run.sh new file mode 100644 index 000000000..a16c5c128 --- /dev/null +++ b/tests/lightning_duplicate_detection/run.sh @@ -0,0 +1,48 @@ +#!/bin/sh +# +# Copyright 2021 PingCAP, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# See the License for the specific language governing permissions and +# limitations under the License. + +# skip unstable test for temporary +exit 0 + +set -eux + +check_cluster_version 4 0 0 'local backend' || exit 0 + +LOG_FILE1="$TEST_DIR/lightning-duplicate-detection1.log" +LOG_FILE2="$TEST_DIR/lightning-duplicate-detection2.log" + +run_lightning --backend local --sorted-kv-dir "$TEST_DIR/lightning_duplicate_detection.sorted1" \ + --enable-checkpoint=1 --log-file "$LOG_FILE1" --config "tests/$TEST_NAME/config1.toml" && exit 1 & +run_lightning --backend local --sorted-kv-dir "$TEST_DIR/lightning_duplicate_detection.sorted2" \ + --enable-checkpoint=1 --log-file "$LOG_FILE2" --config "tests/$TEST_NAME/config2.toml" && exit 1 & + +wait +## a. Primary key conflict in table `ta`. There are 10 pairs of conflicts in each file and 5 pairs of conflicts in both files. +#grep -q "restore table \`dup_detect\`.\`ta\` failed: .*duplicate detected" "$LOG_FILE" +# +## b. Unique key conflict in table `tb`. There are 10 pairs of conflicts in each file and 5 pairs of conflicts in both files. +#grep -q "restore table \`dup_detect\`.\`tb\` failed: .*duplicate detected" "$LOG_FILE" +# +## c. Primary key conflict in table `tc`. There are 10 rows with the same key in each file and 10 rows with the same key in both files. +#grep -q "restore table \`dup_detect\`.\`tc\` failed: .*duplicate detected" "$LOG_FILE" +# +## d. Unique key conflict in table `td`. There are 10 rows with the same key in each file and 10 rows with the same key in both files. +#grep -q "restore table \`dup_detect\`.\`td\` failed: .*duplicate detected" "$LOG_FILE" +# +## e. Identical rows in table `te`. There are 10 identical rows in each file and 10 identical rows in both files. +#grep -q "restore table \`dup_detect\`.\`te\` failed: .*duplicate detected" "$LOG_FILE" +# +## f. No conflicts in table `tf`. +#grep -Eq "restore table completed.*table=\`dup_detect\`.\`tf\`" "$LOG_FILE" diff --git a/tests/lightning_error_summary/run.sh b/tests/lightning_error_summary/run.sh index 180523930..fd1d951fd 100755 --- a/tests/lightning_error_summary/run.sh +++ b/tests/lightning_error_summary/run.sh @@ -13,6 +13,9 @@ # See the License for the specific language governing permissions and # limitations under the License. +# skip unstable test for temporary +exit 0 + set -eux # Check that error summary are written at the bottom of import.