This change adds the file `parquet.go`, which contains helper functions for creating parquet writers and exporting data via `cdcevent.Row` structs. It also adds tests to ensure rows are written to parquet files correctly.

Epic: None

Release note: None
1 parent 631c398, commit b824800. Showing 5 changed files with 266 additions and 11 deletions.
parquet.go
@@ -0,0 +1,86 @@
// Copyright 2022 The Cockroach Authors.
//
// Licensed as a CockroachDB Enterprise file under the Cockroach Community
// License (the "License"); you may not use this file except in compliance with
// the License. You may obtain a copy of the License at
//
// https://github.com/cockroachdb/cockroach/blob/master/licenses/CCL.txt

package changefeedccl

import (
	"io"

	"github.com/cockroachdb/cockroach/pkg/ccl/changefeedccl/cdcevent"
	"github.com/cockroachdb/cockroach/pkg/sql/sem/tree"
	"github.com/cockroachdb/cockroach/pkg/sql/types"
	"github.com/cockroachdb/cockroach/pkg/util/parquet"
)

// NewCDCParquetWriterFromRow constructs a new parquet writer which outputs to
// the given sink. This function interprets the schema from the supplied row.
func NewCDCParquetWriterFromRow(
	row cdcevent.Row, sink io.Writer, maxRowGroupSize int64,
) (*parquet.Writer, error) {
	columnNames := make([]string, len(row.ResultColumns())+1)
	columnTypes := make([]*types.T, len(row.ResultColumns())+1)

	idx := 0
	if err := row.ForEachColumn().Col(func(col cdcevent.ResultColumn) error {
		columnNames[idx] = col.Name
		columnTypes[idx] = col.Typ
		idx += 1
		return nil
	}); err != nil {
		return nil, err
	}

	columnNames[idx] = parquetCrdbEventTypeColName
	columnTypes[idx] = types.String

	schemaDef, err := parquet.NewSchema(columnNames, columnTypes)
	if err != nil {
		return nil, err
	}

	return parquet.NewWriter(schemaDef, sink, parquet.WithMaxRowGroupLength(maxRowGroupSize))
}

// AddData writes the updatedRow to the writer, adding the row's event type.
// If returnDatums is true, this method will return the array of datums
// which were written. Otherwise, the returned array will be nil.
func AddData(
	writer *parquet.Writer, updatedRow cdcevent.Row, prevRow cdcevent.Row, returnDatums bool,
) ([]tree.Datum, error) {
	eventTypeDatum := getEventTypeDatum(updatedRow, prevRow).DString()
	var datumRow []tree.Datum
	addDataToReturn := func(d tree.Datum) {
		if returnDatums {
			if datumRow == nil {
				datumRow = []tree.Datum{}
			}
			datumRow = append(datumRow, d)
		}
	}

	rowWriter, err := writer.AddData()
	if err != nil {
		return nil, err
	}
	idx := 0
	if err := updatedRow.ForEachColumn().Datum(func(d tree.Datum, _ cdcevent.ResultColumn) error {
		if err = rowWriter.WriteColumn(idx, d); err != nil {
			return err
		}
		addDataToReturn(d)
		idx += 1
		return nil
	}); err != nil {
		return nil, err
	}
	if err = rowWriter.WriteColumn(idx, eventTypeDatum); err != nil {
		return nil, err
	}
	addDataToReturn(eventTypeDatum)
	return datumRow, err
}
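To illustrate how the two helpers above are meant to compose, here is a rough usage sketch. It assumes the same package and imports as `parquet.go` above; `writeRowsToParquet` and `eventPair` are hypothetical names introduced only for this example and are not part of the commit.

// eventPair is a hypothetical container pairing one changefeed event's
// updated row with its previous row.
type eventPair struct {
	updated cdcevent.Row
	prev    cdcevent.Row
}

// writeRowsToParquet sketches how the helpers fit together: the first
// decoded row defines the schema, each event is appended along with the
// CRDB event type column, and Close flushes the final row group to the sink.
func writeRowsToParquet(sink io.Writer, events []eventPair, maxRowGroupSize int64) error {
	var writer *parquet.Writer
	for _, ev := range events {
		if writer == nil {
			w, err := NewCDCParquetWriterFromRow(ev.updated, sink, maxRowGroupSize)
			if err != nil {
				return err
			}
			writer = w
		}
		// Pass returnDatums=true instead to get back the datums that were
		// written, as the test below does for verification.
		if _, err := AddData(writer, ev.updated, ev.prev, false); err != nil {
			return err
		}
	}
	if writer == nil {
		return nil
	}
	return writer.Close()
}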
@@ -0,0 +1,149 @@
// Copyright 2022 The Cockroach Authors.
//
// Licensed as a CockroachDB Enterprise file under the Cockroach Community
// License (the "License"); you may not use this file except in compliance with
// the License. You may obtain a copy of the License at
//
// https://github.com/cockroachdb/cockroach/blob/master/licenses/CCL.txt

package changefeedccl

import (
	"context"
	"os"
	"testing"

	"github.com/cockroachdb/cockroach/pkg/base"
	"github.com/cockroachdb/cockroach/pkg/ccl/changefeedccl/cdcevent"
	"github.com/cockroachdb/cockroach/pkg/ccl/changefeedccl/cdctest"
	"github.com/cockroachdb/cockroach/pkg/ccl/changefeedccl/changefeedbase"
	"github.com/cockroachdb/cockroach/pkg/jobs/jobspb"
	"github.com/cockroachdb/cockroach/pkg/kv/kvpb"
	"github.com/cockroachdb/cockroach/pkg/roachpb"
	"github.com/cockroachdb/cockroach/pkg/sql"
	"github.com/cockroachdb/cockroach/pkg/sql/sem/tree"
	"github.com/cockroachdb/cockroach/pkg/testutils/serverutils"
	"github.com/cockroachdb/cockroach/pkg/testutils/skip"
	"github.com/cockroachdb/cockroach/pkg/testutils/sqlutils"
	"github.com/cockroachdb/cockroach/pkg/util/leaktest"
	"github.com/cockroachdb/cockroach/pkg/util/log"
	"github.com/cockroachdb/cockroach/pkg/util/parquet"
	"github.com/stretchr/testify/require"
)

func TestParquetRows(t *testing.T) {
	defer leaktest.AfterTest(t)()
	defer log.Scope(t).Close(t)

	// Rangefeed reader can time out under stress.
	skip.UnderStress(t)

	ctx := context.Background()
	s, db, _ := serverutils.StartServer(t, base.TestServerArgs{
		// TODO(#98816): cdctest.GetHydratedTableDescriptor does not work with tenant dbs.
		// Once it is fixed, this flag can be removed.
		DefaultTestTenant: base.TestTenantDisabled,
	})
	defer s.Stopper().Stop(ctx)

	maxRowGroupSize := int64(2)

	sqlDB := sqlutils.MakeSQLRunner(db)

	for _, tc := range []struct {
		testName    string
		createTable string
		inserts     []string
	}{
		{
			testName: "mixed",
			createTable: `CREATE TABLE foo (
				int32Col INT4 PRIMARY KEY,
				varCharCol VARCHAR(16),
				charCol CHAR(2),
				tsCol TIMESTAMP,
				stringCol STRING,
				decimalCOl DECIMAL(12,2),
				uuidCol UUID
			)`,
			inserts: []string{
				`INSERT INTO foo values (0, 'zero', 'CA', now(), 'oiwjfoijsdjif', 'inf', gen_random_uuid())`,
				`INSERT INTO foo values (1, 'one', 'NY', now(), 'sdi9fu90d', '-1.90', gen_random_uuid())`,
				`INSERT INTO foo values (2, 'two', 'WA', now(), 'sd9fid9fuj', '0.01', gen_random_uuid())`,
				`INSERT INTO foo values (3, 'three', 'ON', now(), 'sadklfhkdlsjf', '1.2', gen_random_uuid())`,
				`INSERT INTO foo values (4, 'four', 'NS', now(), '123123', '-11222221.2', gen_random_uuid())`,
				`INSERT INTO foo values (5, 'five', 'BC', now(), 'sadklfhkdlsjf', '1.2', gen_random_uuid())`,
				`INSERT INTO foo values (6, 'siz', 'AB', now(), '123123', '-11222221.2', gen_random_uuid())`,
			},
		},
	} {
		t.Run(tc.testName, func(t *testing.T) {
			sqlDB.Exec(t, tc.createTable)
			defer func() {
				sqlDB.Exec(t, "DROP TABLE foo")
			}()

			popRow, cleanup, decoder := makeRangefeedReaderAndDecoder(t, s)
			defer cleanup()

			fileName := "TestParquetRows"
			var writer *parquet.Writer
			var numCols int
			f, err := os.CreateTemp(os.TempDir(), fileName)
			require.NoError(t, err)

			numRows := len(tc.inserts)
			for _, insertStmt := range tc.inserts {
				sqlDB.Exec(t, insertStmt)
			}

			datums := make([][]tree.Datum, numRows)
			for i := 0; i < numRows; i++ {
				v := popRow(t)

				updatedRow, err := decoder.DecodeKV(
					ctx, roachpb.KeyValue{Key: v.Key, Value: v.Value}, cdcevent.CurrentRow, v.Timestamp(), false)
				require.NoError(t, err)

				prevRow, err := decoder.DecodeKV(
					ctx, roachpb.KeyValue{Key: v.Key, Value: v.PrevValue}, cdcevent.PrevRow, v.Timestamp(), false)
				require.NoError(t, err)

				if writer == nil {
					writer, err = NewCDCParquetWriterFromRow(updatedRow, f, maxRowGroupSize)
					if err != nil {
						t.Fatalf(err.Error())
					}
					numCols = len(updatedRow.ResultColumns()) + 1
				}

				datumRow, err := AddData(writer, updatedRow, prevRow, true)
				datums[i] = datumRow
				require.NoError(t, err)
			}

			err = writer.Close()
			require.NoError(t, err)

			parquet.ReadFileAndVerifyDatums(t, f.Name(), numRows, numCols, writer, datums)
		})
	}
}

func makeRangefeedReaderAndDecoder(
	t *testing.T, s serverutils.TestServerInterface,
) (func(t *testing.T) *kvpb.RangeFeedValue, func(), cdcevent.Decoder) {
	tableDesc := cdctest.GetHydratedTableDescriptor(t, s.ExecutorConfig(), "foo")
	popRow, cleanup := cdctest.MakeRangeFeedValueReader(t, s.ExecutorConfig(), tableDesc)
	targets := changefeedbase.Targets{}
	targets.Add(changefeedbase.Target{
		Type:       jobspb.ChangefeedTargetSpecification_PRIMARY_FAMILY_ONLY,
		TableID:    tableDesc.GetID(),
		FamilyName: "primary",
	})
	sqlExecCfg := s.ExecutorConfig().(sql.ExecutorConfig)
	ctx := context.Background()
	decoder, err := cdcevent.NewEventDecoder(ctx, &sqlExecCfg, targets, false, false)
	require.NoError(t, err)
	return popRow, cleanup, decoder
}
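As a side note on the test's configuration: `WithMaxRowGroupLength(maxRowGroupSize)` caps the number of rows per row group, so with `maxRowGroupSize = 2` and the seven inserts above the writer should emit ceil(7/2) = 4 row groups. A small sketch of that arithmetic follows; the `expectedRowGroups` helper is hypothetical and not part of the commit.

// expectedRowGroups estimates how many row groups a writer configured with
// parquet.WithMaxRowGroupLength(maxRowGroupSize) should produce after
// numRows rows have been written: ceil(numRows / maxRowGroupSize).
func expectedRowGroups(numRows, maxRowGroupSize int64) int64 {
	if maxRowGroupSize <= 0 || numRows <= 0 {
		return 0
	}
	return (numRows + maxRowGroupSize - 1) / maxRowGroupSize
}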