From 2e046c099bc076f9c31ad5c2ddd7ceddbb46769b Mon Sep 17 00:00:00 2001
From: Chunzhu Li
Date: Mon, 7 Nov 2022 16:59:37 +0800
Subject: [PATCH 1/4] support checkpoint read for compressed files

---
 br/pkg/lightning/mydump/csv_parser.go         |   5 +
 br/pkg/lightning/mydump/parquet_parser.go     |   6 +
 br/pkg/lightning/mydump/parser.go             |  26 ++++
 br/pkg/lightning/mydump/router.go             |  16 +++
 .../lightning/restore/chunk_restore_test.go   | 124 ++++++++++++++++++
 br/pkg/lightning/restore/restore.go           |  29 +++-
 br/pkg/storage/writer.go                      |  20 +--
 7 files changed, 211 insertions(+), 15 deletions(-)

diff --git a/br/pkg/lightning/mydump/csv_parser.go b/br/pkg/lightning/mydump/csv_parser.go
index 96de51bd49c73..f1a13389ba7b7 100644
--- a/br/pkg/lightning/mydump/csv_parser.go
+++ b/br/pkg/lightning/mydump/csv_parser.go
@@ -573,3 +573,8 @@ func (parser *CSVParser) ReadUntilTerminator() (int64, error) {
 		}
 	}
 }
+
+// SetRowID sets the rowID in a csv file when we start a compressed file.
+func (parser *CSVParser) SetRowID(rowID int64) {
+	parser.lastRow.RowID = rowID
+}
diff --git a/br/pkg/lightning/mydump/parquet_parser.go b/br/pkg/lightning/mydump/parquet_parser.go
index 37f193666492f..e7ac2baa6d80f 100644
--- a/br/pkg/lightning/mydump/parquet_parser.go
+++ b/br/pkg/lightning/mydump/parquet_parser.go
@@ -579,6 +579,12 @@ func (pp *ParquetParser) SetLogger(l log.Logger) {
 	pp.logger = l
 }
 
+// SetRowID sets the rowID in a parquet file when we start a compressed file.
+// It implements the Parser interface.
+func (pp *ParquetParser) SetRowID(rowID int64) {
+	pp.lastRow.RowID = rowID
+}
+
 func jdToTime(jd int32, nsec int64) time.Time {
 	sec := int64(jd-jan011970) * secPerDay
 	// it's fine not to check the value of nsec
diff --git a/br/pkg/lightning/mydump/parser.go b/br/pkg/lightning/mydump/parser.go
index 1560dd4c14a44..75dae1d1bfb2f 100644
--- a/br/pkg/lightning/mydump/parser.go
+++ b/br/pkg/lightning/mydump/parser.go
@@ -138,6 +138,8 @@ type Parser interface {
 	SetColumns([]string)
 
 	SetLogger(log.Logger)
+
+	SetRowID(rowID int64)
 }
 
 // NewChunkParser creates a new parser which can read chunks out of a file.
@@ -205,6 +207,11 @@ func (parser *blockParser) SetLogger(logger log.Logger) {
 	parser.Logger = logger
 }
 
+// SetRowID changes the reported row ID when we first read compressed files.
+func (parser *blockParser) SetRowID(rowID int64) {
+	parser.lastRow.RowID = rowID
+}
+
 type token byte
 
 const (
@@ -592,3 +599,22 @@ func ReadChunks(parser Parser, minSize int64) ([]Chunk, error) {
 		}
 	}
 }
+
+// ReadUntil reads rows from the parser until the parser's position reaches
+// pos, or the input is exhausted.
+func ReadUntil(parser Parser, pos int64) error {
+	var curOffset int64
+	for curOffset < pos {
+		switch err := parser.ReadRow(); errors.Cause(err) {
+		case nil:
+			curOffset, _ = parser.Pos()
+
+		case io.EOF:
+			return nil
+
+		default:
+			return errors.Trace(err)
+		}
+	}
+	return nil
+}
diff --git a/br/pkg/lightning/mydump/router.go b/br/pkg/lightning/mydump/router.go
index 2ed3512edc96e..9d70431a14b07 100644
--- a/br/pkg/lightning/mydump/router.go
+++ b/br/pkg/lightning/mydump/router.go
@@ -9,6 +9,7 @@ import (
 	"github.com/pingcap/errors"
 	"github.com/pingcap/tidb/br/pkg/lightning/config"
 	"github.com/pingcap/tidb/br/pkg/lightning/log"
+	"github.com/pingcap/tidb/br/pkg/storage"
 	"github.com/pingcap/tidb/util/filter"
 	"github.com/pingcap/tidb/util/slice"
 	"go.uber.org/zap"
 )
@@ -71,6 +72,21 @@ const (
 	CompressionSnappy
 )
 
+func ToStorageCompressType(compression Compression) (storage.CompressType, error) {
+	switch compression {
+	case CompressionGZ:
+		return storage.Gzip, nil
+	case CompressionSnappy:
+		return storage.Snappy, nil
+	case CompressionZStd:
+		return storage.Zstd, nil
+	case CompressionNone:
+		return storage.NoCompression, nil
+	default:
+		return storage.NoCompression, errors.Errorf("compression %s doesn't have related storage compressType", compression)
+	}
+}
+
 func parseSourceType(t string) (SourceType, error) {
 	switch strings.ToLower(strings.TrimSpace(t)) {
 	case SchemaSchema:
diff --git a/br/pkg/lightning/restore/chunk_restore_test.go b/br/pkg/lightning/restore/chunk_restore_test.go
index 185cc5b4219ca..f15c515aa5658 100644
--- a/br/pkg/lightning/restore/chunk_restore_test.go
+++ b/br/pkg/lightning/restore/chunk_restore_test.go
@@ -15,9 +15,13 @@
 package restore
 
 import (
+	"compress/gzip"
 	"context"
+	"fmt"
+	"io"
 	"os"
 	"path/filepath"
+	"strconv"
 	"sync"
 	"testing"
 
@@ -40,8 +44,10 @@ import (
 	"github.com/pingcap/tidb/parser"
 	"github.com/pingcap/tidb/parser/ast"
 	"github.com/pingcap/tidb/parser/model"
+	"github.com/pingcap/tidb/parser/mysql"
 	"github.com/pingcap/tidb/types"
 	tmock "github.com/pingcap/tidb/util/mock"
+	filter "github.com/pingcap/tidb/util/table-filter"
 	"github.com/stretchr/testify/require"
 	"github.com/stretchr/testify/suite"
 )
@@ -654,3 +660,121 @@ func (s *chunkRestoreSuite) TestRestore() {
 	require.NoError(s.T(), err)
 	require.Len(s.T(), saveCpCh, 2)
 }
+
+func TestCompressChunkRestore(t *testing.T) {
+	// Produce a mock table info
+	p := parser.New()
+	p.SetSQLMode(mysql.ModeANSIQuotes)
+	se := tmock.NewContext()
+	node, err := p.ParseOneStmt(`
+	CREATE TABLE "table" (
+		a INT,
+		b INT,
+		c INT,
+		KEY (b)
+	)
+`, "", "")
+	require.NoError(t, err)
+	core, err := ddl.MockTableInfo(se, node.(*ast.CreateTableStmt), 0xabcdef)
+	require.NoError(t, err)
+	core.State = model.StatePublic
+
+	// Write some sample CSV dump
+	fakeDataDir := t.TempDir()
+	store, err := storage.NewLocalStorage(fakeDataDir)
+	require.NoError(t, err)
+
+	fakeDataFiles := make([]mydump.FileInfo, 0)
+
+	csvName := "db.table.1.csv.gz"
+	file, err := os.Create(filepath.Join(fakeDataDir, csvName))
+	require.NoError(t, err)
+	gzWriter := gzip.NewWriter(file)
+
+	var totalBytes int64
+	for i := 0; i < 300; i += 3 {
+		n, err := gzWriter.Write([]byte(fmt.Sprintf("%d,%d,%d\r\n", i, i+1, i+2)))
+		require.NoError(t, err)
+		totalBytes += int64(n)
+	}
+
+	err = gzWriter.Close()
+	require.NoError(t, err)
+	err = file.Close()
+	require.NoError(t, err)
+
+	fakeDataFiles = append(fakeDataFiles, mydump.FileInfo{
+		TableName: filter.Table{Schema: "db", Name: "table"},
+		FileMeta: mydump.SourceFileMeta{
+			Path:        csvName,
+			Type:        mydump.SourceTypeCSV,
+			Compression: mydump.CompressionGZ,
+			SortKey:     "99",
+			FileSize:    totalBytes,
+		},
+	})
+
+	chunk := checkpoints.ChunkCheckpoint{
+		Key:      checkpoints.ChunkCheckpointKey{Path: fakeDataFiles[0].FileMeta.Path, Offset: 0},
+		FileMeta: fakeDataFiles[0].FileMeta,
+		Chunk: mydump.Chunk{
+			Offset:       0,
+			EndOffset:    totalBytes,
+			PrevRowIDMax: 0,
+			RowIDMax:     100,
+		},
+	}
+	ctx, cancel := context.WithCancel(context.Background())
+	defer cancel()
+	w := worker.NewPool(ctx, 5, "io")
+	cfg := config.NewConfig()
+	cfg.Mydumper.BatchSize = 111
+	cfg.App.TableConcurrency = 2
+	cfg.Mydumper.CSV.Header = false
+
+	cr, err := newChunkRestore(ctx, 1, cfg, &chunk, w, store, nil)
+	require.NoError(t, err)
+	var (
+		id, lastID int
+		offset     int64 = 0
+		rowID      int64 = 0
+	)
+	for id < 100 {
+		offset, rowID = cr.parser.Pos()
+		err = cr.parser.ReadRow()
+		require.NoError(t, err)
+		rowData := cr.parser.LastRow().Row
+		require.Len(t, rowData, 3)
+		lastID = id
+		for i := 0; id < 100 && i < 3; i++ {
+			require.Equal(t, strconv.Itoa(id), rowData[i].GetString())
+			id++
+		}
+	}
+
+	// test read starting from the middle of a compressed file
+	chunk = checkpoints.ChunkCheckpoint{
+		Key:      checkpoints.ChunkCheckpointKey{Path: fakeDataFiles[0].FileMeta.Path, Offset: offset},
+		FileMeta: fakeDataFiles[0].FileMeta,
+		Chunk: mydump.Chunk{
+			Offset:       offset,
+			EndOffset:    totalBytes,
+			PrevRowIDMax: rowID,
+			RowIDMax:     100,
+		},
+	}
+	cr, err = newChunkRestore(ctx, 1, cfg, &chunk, w, store, nil)
+	require.NoError(t, err)
+	for id = lastID; id < 300; {
+		err = cr.parser.ReadRow()
+		require.NoError(t, err)
+		rowData := cr.parser.LastRow().Row
+		require.Len(t, rowData, 3)
+		for i := 0; id < 300 && i < 3; i++ {
+			require.Equal(t, strconv.Itoa(id), rowData[i].GetString())
+			id++
+		}
+	}
+	err = cr.parser.ReadRow()
+	require.Equal(t, io.EOF, errors.Cause(err))
+}
diff --git a/br/pkg/lightning/restore/restore.go b/br/pkg/lightning/restore/restore.go
index 81894611c410a..0a0e05b45ac5d 100644
--- a/br/pkg/lightning/restore/restore.go
+++ b/br/pkg/lightning/restore/restore.go
@@ -2190,11 +2190,21 @@ func newChunkRestore(
 ) (*chunkRestore, error) {
 	blockBufSize := int64(cfg.Mydumper.ReadBlockSize)
 
-	var reader storage.ReadSeekCloser
-	var err error
-	if chunk.FileMeta.Type == mydump.SourceTypeParquet {
+	var (
+		reader       storage.ReadSeekCloser
+		compressType storage.CompressType
+		err          error
+	)
+	switch {
+	case chunk.FileMeta.Type == mydump.SourceTypeParquet:
 		reader, err = mydump.OpenParquetReader(ctx, store, chunk.FileMeta.Path, chunk.FileMeta.FileSize)
-	} else {
+	case chunk.FileMeta.Compression != mydump.CompressionNone:
+		compressType, err = mydump.ToStorageCompressType(chunk.FileMeta.Compression)
+		if err != nil {
+			break
+		}
+		reader, err = storage.WithCompression(store, compressType).Open(ctx, chunk.FileMeta.Path)
+	default:
 		reader, err = store.Open(ctx, chunk.FileMeta.Path)
 	}
 	if err != nil {
@@ -2225,8 +2235,15 @@ func newChunkRestore(
 		panic(fmt.Sprintf("file '%s' with unknown source type '%s'", chunk.Key.Path, chunk.FileMeta.Type.String()))
 	}
 
-	if err = parser.SetPos(chunk.Chunk.Offset, chunk.Chunk.PrevRowIDMax); err != nil {
-		return nil, errors.Trace(err)
+	if chunk.FileMeta.Compression == mydump.CompressionNone {
+		if err = parser.SetPos(chunk.Chunk.Offset, chunk.Chunk.PrevRowIDMax); err != nil {
+			return nil, errors.Trace(err)
+		}
+	} else {
+		if err = mydump.ReadUntil(parser, chunk.Chunk.Offset); err != nil {
+			return nil, errors.Trace(err)
+		}
+		parser.SetRowID(chunk.Chunk.PrevRowIDMax)
 	}
 	if len(chunk.ColumnPermutation) > 0 {
 		parser.SetColumns(getColumnNames(tableInfo.Core, chunk.ColumnPermutation))
diff --git a/br/pkg/storage/writer.go b/br/pkg/storage/writer.go
index 72d0e6dc61f4f..f61d30fa530d9 100644
--- a/br/pkg/storage/writer.go
+++ b/br/pkg/storage/writer.go
@@ -48,16 +48,18 @@ type interceptBuffer interface {
 }
 
 func createSuffixString(compressType CompressType) string {
-	if compressType == Gzip {
-		return ".txt.gz"
-	}
-	if compressType == Snappy {
-		return ".txt.snappy"
-	}
-	if compressType == Zstd {
-		return ".txt.zst"
+	txtSuffix := ".txt"
+	switch compressType {
+	case Gzip:
+		txtSuffix += ".gz"
+	case Snappy:
+		txtSuffix += ".snappy"
+	case Zstd:
+		txtSuffix += ".zst"
+	default:
+		return ""
 	}
-	return ""
+	return txtSuffix
 }
 
 func newInterceptBuffer(chunkSize int, compressType CompressType) interceptBuffer {

From aa187e5f5971bc91f0b733f1e9228852cd8253f5 Mon Sep 17 00:00:00 2001
From: Chunzhu Li
Date: Mon, 7 Nov 2022 17:31:17 +0800
Subject: [PATCH 2/4] check rowID

---
 br/pkg/lightning/restore/chunk_restore_test.go | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/br/pkg/lightning/restore/chunk_restore_test.go b/br/pkg/lightning/restore/chunk_restore_test.go
index f15c515aa5658..3cf02f05992ea 100644
--- a/br/pkg/lightning/restore/chunk_restore_test.go
+++ b/br/pkg/lightning/restore/chunk_restore_test.go
@@ -751,6 +751,7 @@ func TestCompressChunkRestore(t *testing.T) {
 			id++
 		}
 	}
+	require.Equal(t, int64(33), rowID)
 
 	// test read starting from the middle of a compressed file
 	chunk = checkpoints.ChunkCheckpoint{
@@ -775,6 +776,8 @@ func TestCompressChunkRestore(t *testing.T) {
 			id++
 		}
 	}
+	_, rowID = cr.parser.Pos()
+	require.Equal(t, int64(100), rowID)
 	err = cr.parser.ReadRow()
 	require.Equal(t, io.EOF, errors.Cause(err))
 }

From 4af5ee6641a324e19c7e2b16457d35aaff2d97f3 Mon Sep 17 00:00:00 2001
From: Chunzhu Li
Date: Tue, 8 Nov 2022 17:31:11 +0800
Subject: [PATCH 3/4] update

---
 br/pkg/lightning/mydump/router.go | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/br/pkg/lightning/mydump/router.go b/br/pkg/lightning/mydump/router.go
index 9d70431a14b07..bdc2a922f12f7 100644
--- a/br/pkg/lightning/mydump/router.go
+++ b/br/pkg/lightning/mydump/router.go
@@ -72,6 +72,7 @@ const (
 	CompressionSnappy
 )
 
+// ToStorageCompressType converts Compression to storage.CompressType.
 func ToStorageCompressType(compression Compression) (storage.CompressType, error) {
 	switch compression {
 	case CompressionGZ:
@@ -83,7 +84,7 @@ func ToStorageCompressType(compression Compression) (storage.CompressType, error
 	case CompressionNone:
 		return storage.NoCompression, nil
 	default:
-		return storage.NoCompression, errors.Errorf("compression %s doesn't have related storage compressType", compression)
+		return storage.NoCompression, errors.Errorf("compression %d doesn't have related storage compressType", compression)
 	}
 }

From fb59b88a34cf3409280b6045aa58fbf38fa45818 Mon Sep 17 00:00:00 2001
From: Chunzhu Li
Date: Thu, 10 Nov 2022 20:05:21 +0800
Subject: [PATCH 4/4] address comments

---
 br/pkg/lightning/mydump/csv_parser.go          | 5 -----
 br/pkg/lightning/mydump/parser.go              | 1 +
 br/pkg/lightning/restore/chunk_restore_test.go | 3 +--
 3 files changed, 2 insertions(+), 7 deletions(-)

diff --git a/br/pkg/lightning/mydump/csv_parser.go b/br/pkg/lightning/mydump/csv_parser.go
index f1a13389ba7b7..96de51bd49c73 100644
--- a/br/pkg/lightning/mydump/csv_parser.go
+++ b/br/pkg/lightning/mydump/csv_parser.go
@@ -573,8 +573,3 @@ func (parser *CSVParser) ReadUntilTerminator() (int64, error) {
 		}
 	}
 }
-
-// SetRowID sets the rowID in a csv file when we start a compressed file.
-func (parser *CSVParser) SetRowID(rowID int64) {
-	parser.lastRow.RowID = rowID
-}
diff --git a/br/pkg/lightning/mydump/parser.go b/br/pkg/lightning/mydump/parser.go
index 75dae1d1bfb2f..73f84424bf5e3 100644
--- a/br/pkg/lightning/mydump/parser.go
+++ b/br/pkg/lightning/mydump/parser.go
@@ -176,6 +176,7 @@ func (parser *blockParser) SetPos(pos int64, rowID int64) error {
 }
 
 // Pos returns the current file offset.
+// Attention: for compressed sql/csv files, pos is the position in the uncompressed file.
 func (parser *blockParser) Pos() (pos int64, lastRowID int64) {
 	return parser.pos, parser.lastRow.RowID
 }
diff --git a/br/pkg/lightning/restore/chunk_restore_test.go b/br/pkg/lightning/restore/chunk_restore_test.go
index 3cf02f05992ea..452e82821c9fa 100644
--- a/br/pkg/lightning/restore/chunk_restore_test.go
+++ b/br/pkg/lightning/restore/chunk_restore_test.go
@@ -665,7 +665,6 @@ func TestCompressChunkRestore(t *testing.T) {
 	// Produce a mock table info
 	p := parser.New()
 	p.SetSQLMode(mysql.ModeANSIQuotes)
-	se := tmock.NewContext()
 	node, err := p.ParseOneStmt(`
 	CREATE TABLE "table" (
 		a INT,
@@ -675,7 +674,7 @@ func TestCompressChunkRestore(t *testing.T) {
 	)
 `, "", "")
 	require.NoError(t, err)
-	core, err := ddl.MockTableInfo(se, node.(*ast.CreateTableStmt), 0xabcdef)
+	core, err := ddl.BuildTableInfoFromAST(node.(*ast.CreateTableStmt))
 	require.NoError(t, err)
 	core.State = model.StatePublic