From 0e1801210253669c871ce42a5e51f97fe1143d9f Mon Sep 17 00:00:00 2001 From: Jimmy Lu Date: Wed, 18 Dec 2024 07:05:45 -0800 Subject: [PATCH] feat: Table evolution fuzzer (#11872) Summary: X-link: https://github.com/facebookincubator/nimble/pull/116 Basic version of table evolution fuzzer. Full design of the fuzzer can be found at https://docs.google.com/document/d/18jjNRknSxI99mgdL7eDMzkq65i-mi47Ukymi3JkXzUA/edit Differential Revision: D67283632 --- velox/dwio/common/FileSink.cpp | 4 +- velox/exec/Task.cpp | 8 +- velox/exec/tests/CMakeLists.txt | 15 + velox/exec/tests/HashJoinTest.cpp | 2 +- velox/exec/tests/TableEvolutionFuzzerTest.cpp | 666 ++++++++++++++++++ velox/expression/fuzzer/ArgumentTypeFuzzer.h | 7 +- .../fuzzer/tests/ArgGeneratorTestUtils.cpp | 4 +- .../fuzzer/tests/ArgumentTypeFuzzerTest.cpp | 26 +- .../fuzzer/tests/ExpressionFuzzerUnitTest.cpp | 8 +- .../sparksql/tests/ArgGeneratorTest.cpp | 2 +- velox/vector/fuzzer/Utils.h | 4 +- velox/vector/fuzzer/VectorFuzzer.cpp | 9 +- velox/vector/fuzzer/VectorFuzzer.h | 4 + .../vector/fuzzer/tests/VectorFuzzerTest.cpp | 2 + 14 files changed, 726 insertions(+), 35 deletions(-) create mode 100644 velox/exec/tests/TableEvolutionFuzzerTest.cpp diff --git a/velox/dwio/common/FileSink.cpp b/velox/dwio/common/FileSink.cpp index 35fb65c1b7a3..3dcc44186182 100644 --- a/velox/dwio/common/FileSink.cpp +++ b/velox/dwio/common/FileSink.cpp @@ -157,8 +157,8 @@ LocalFileSink::LocalFileSink( } void LocalFileSink::doClose() { - LOG(INFO) << "closing file: " << name() - << ", total size: " << succinctBytes(size_); + VLOG(1) << "closing file: " << name() + << ", total size: " << succinctBytes(size_); if (writeFile_ != nullptr) { writeFile_->close(); } diff --git a/velox/exec/Task.cpp b/velox/exec/Task.cpp index 2d17153e4137..fe97ad27dc4f 100644 --- a/velox/exec/Task.cpp +++ b/velox/exec/Task.cpp @@ -1244,10 +1244,10 @@ void Task::removeDriver(std::shared_ptr self, Driver* driver) { } if (self->numFinishedDrivers_ == self->numTotalDrivers_) { - LOG(INFO) << "All drivers (" << self->numFinishedDrivers_ - << ") finished for task " << self->taskId() - << " after running for " - << succinctMillis(self->timeSinceStartMsLocked()); + VLOG(1) << "All drivers (" << self->numFinishedDrivers_ + << ") finished for task " << self->taskId() + << " after running for " + << succinctMillis(self->timeSinceStartMsLocked()); } } stateChangeNotifier.notify(); diff --git a/velox/exec/tests/CMakeLists.txt b/velox/exec/tests/CMakeLists.txt index 9683f5008376..adf0252d8e23 100644 --- a/velox/exec/tests/CMakeLists.txt +++ b/velox/exec/tests/CMakeLists.txt @@ -256,6 +256,21 @@ target_link_libraries( GTest::gtest GTest::gtest_main) +add_executable(velox_table_evolution_fuzzer_test TableEvolutionFuzzerTest.cpp) + +target_link_libraries( + velox_table_evolution_fuzzer_test + velox_exec_test_lib + velox_temp_path + velox_vector_fuzzer + GTest::gtest + GTest::gtest_main) + +add_test( + NAME velox_table_evolution_fuzzer_test + COMMAND velox_table_evolution_fuzzer_test + WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}) + add_executable(velox_aggregation_runner_test AggregationRunnerTest.cpp) target_link_libraries( diff --git a/velox/exec/tests/HashJoinTest.cpp b/velox/exec/tests/HashJoinTest.cpp index 0d423a98f214..abe27e052c6c 100644 --- a/velox/exec/tests/HashJoinTest.cpp +++ b/velox/exec/tests/HashJoinTest.cpp @@ -532,7 +532,7 @@ class HashJoinBuilder { if (vectorSize != 0) { fuzzerOpts_.vectorSize = vectorSize; fuzzerOpts_.nullRatio = nullRatio; - VectorFuzzer fuzzer(fuzzerOpts_, &pool_); + VectorFuzzer fuzzer(fuzzerOpts_, &pool_, 42); for (int32_t i = 0; i < numVectors; ++i) { vectors.push_back(fuzzer.fuzzInputRow(rowType)); } diff --git a/velox/exec/tests/TableEvolutionFuzzerTest.cpp b/velox/exec/tests/TableEvolutionFuzzerTest.cpp new file mode 100644 index 000000000000..ff1e408206c2 --- /dev/null +++ b/velox/exec/tests/TableEvolutionFuzzerTest.cpp @@ -0,0 +1,666 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "velox/connectors/hive/HiveConnector.h" +#include "velox/connectors/hive/HiveConnectorSplit.h" +#include "velox/dwio/dwrf/RegisterDwrfReader.h" +#include "velox/dwio/dwrf/RegisterDwrfWriter.h" +#include "velox/exec/Cursor.h" +#include "velox/exec/tests/utils/PlanBuilder.h" +#include "velox/exec/tests/utils/TempDirectoryPath.h" +#include "velox/vector/fuzzer/VectorFuzzer.h" + +#include +#include +#include + +#include + +DEFINE_uint32(seed, 0, ""); +DEFINE_int32(duration_sec, 30, ""); +DEFINE_int32(column_count, 5, ""); +DEFINE_int32(evolution_count, 5, ""); + +namespace facebook::velox::exec::test { + +namespace { + +constexpr int kVectorSize = 101; + +VectorFuzzer::Options makeVectorFuzzerOptions() { + VectorFuzzer::Options options; + options.vectorSize = kVectorSize; + options.allowSlice = false; + return options; +} + +} // namespace + +class TableEvolutionFuzzer { + public: + struct Config { + int columnCount; + int evolutionCount; + std::vector formats; + memory::MemoryPool* pool; + }; + + static const std::string& connectorId() { + static const std::string connectorId(PlanBuilder::kHiveDefaultConnectorId); + return connectorId; + } + + explicit TableEvolutionFuzzer(const Config& config) + : config_(config), vectorFuzzer_(makeVectorFuzzerOptions(), config.pool) { + VELOX_CHECK_GT(config_.columnCount, 0); + VELOX_CHECK_GT(config_.evolutionCount, 1); + VELOX_CHECK(!config_.formats.empty()); + } + + unsigned seed() const { + return currentSeed_; + } + + void setSeed(unsigned seed) { + currentSeed_ = seed; + rng_.seed(seed); + vectorFuzzer_.reSeed(rng_()); + } + + void reSeed() { + setSeed(rng_()); + } + + void run(); + + private: + struct Setup { + // Potentially with different field names, widened types, and additional + // fields compared to previous setup. + RowTypePtr schema; + + // New bucket count, must be a multiple of the bucket count in previous + // setup. + int log2BucketCount; + + dwio::common::FileFormat fileFormat; + + int bucketCount() const { + return 1 << log2BucketCount; + } + }; + + friend std::ostream& operator<<( + std::ostream& out, + const TableEvolutionFuzzer::Setup& setup); + + std::string makeNewName() { + return fmt::format("name_{}", ++sequenceNumber_); + } + + TypePtr makeNewType(int maxDepth) { + // All types that can be written to file directly. + static const std::vector scalarTypes = { + BOOLEAN(), + TINYINT(), + SMALLINT(), + INTEGER(), + BIGINT(), + REAL(), + DOUBLE(), + VARCHAR(), + VARBINARY(), + }; + return vectorFuzzer_.randType(scalarTypes, maxDepth); + } + + RowTypePtr makeInitialSchema() { + std::vector names(config_.columnCount); + std::vector types(config_.columnCount); + for (int i = 0; i < config_.columnCount; ++i) { + names[i] = makeNewName(); + types[i] = makeNewType(3); + } + return ROW(std::move(names), std::move(types)); + } + + TypePtr evolveType(const TypePtr& old) { + switch (old->kind()) { + case TypeKind::ARRAY: + return ARRAY(evolveType(old->asArray().elementType())); + case TypeKind::MAP: { + auto& mapType = old->asMap(); + return MAP( + evolveType(mapType.keyType()), evolveType(mapType.valueType())); + } + case TypeKind::ROW: + return evolveRowType(old->asRow(), {}); + default: + if (!folly::Random::oneIn(4, rng_)) { + return old; + } + } + + switch (old->kind()) { + case TypeKind::TINYINT: + return SMALLINT(); + case TypeKind::SMALLINT: + return INTEGER(); + case TypeKind::INTEGER: + return BIGINT(); + case TypeKind::REAL: + return DOUBLE(); + default: + return old; + } + } + + RowTypePtr evolveRowType( + const RowType& old, + const std::vector& bucketColumnIndices) { + auto names = old.names(); + auto types = old.children(); + for (int i = 0, j = 0; i < old.size(); ++i) { + // Skip evolving bucket column. + while (j < bucketColumnIndices.size() && bucketColumnIndices[j] < i) { + ++j; + } + if (j < bucketColumnIndices.size() && bucketColumnIndices[j] == i) { + continue; + } + if (folly::Random::oneIn(4, rng_)) { + names[i] = makeNewName(); + } + types[i] = evolveType(types[i]); + } + if (folly::Random::oneIn(4, rng_)) { + names.push_back(makeNewName()); + types.push_back(makeNewType(2)); + } + return ROW(std::move(names), std::move(types)); + } + + std::vector makeSetups( + const std::vector& bucketColumnIndices) { + std::vector setups(config_.evolutionCount); + for (int i = 0; i < config_.evolutionCount; ++i) { + if (i == 0) { + setups[i].schema = makeInitialSchema(); + } else { + setups[i].schema = + evolveRowType(*setups[i - 1].schema, bucketColumnIndices); + } + if (!bucketColumnIndices.empty()) { + if (i == 0) { + setups[i].log2BucketCount = folly::Random::rand32(1, 4, rng_); + } else { + setups[i].log2BucketCount = std::min( + 8, + setups[i - 1].log2BucketCount + folly::Random::rand32(3, rng_)); + } + } else { + setups[i].log2BucketCount = 0; + } + setups[i].fileFormat = + config_.formats[folly::Random::rand32(config_.formats.size(), rng_)]; + VLOG(1) << "Setup " << i << ": " << setups[i]; + } + return setups; + } + + static std::unique_ptr makeWriteTask( + const Setup& setup, + const RowVectorPtr& data, + const std::string& outputDir, + const std::vector& bucketColumnIndices) { + auto builder = PlanBuilder().values({data}); + if (bucketColumnIndices.empty()) { + builder.tableWrite(outputDir, setup.fileFormat); + } else { + std::vector bucketColumnNames; + bucketColumnNames.reserve(bucketColumnIndices.size()); + for (auto i : bucketColumnIndices) { + bucketColumnNames.push_back(setup.schema->nameOf(i)); + } + builder.tableWrite( + outputDir, + /*partitionBy=*/{}, + setup.bucketCount(), + bucketColumnNames, + setup.fileFormat); + } + CursorParameters params; + params.serialExecution = true; + params.planNode = builder.planNode(); + return TaskCursor::create(params); + } + + template + VectorPtr liftToPrimitiveType( + const FlatVector& input, + const TypePtr& type) { + auto targetBuffer = AlignedBuffer::allocate(input.size(), config_.pool); + auto* rawTargetValues = targetBuffer->template asMutable(); + auto* rawSourceValues = input.rawValues(); + for (vector_size_t i = 0; i < input.size(); ++i) { + rawTargetValues[i] = rawSourceValues[i]; + } + return std::make_shared>( + config_.pool, + type, + input.nulls(), + input.size(), + std::move(targetBuffer), + std::vector({})); + } + + VectorPtr liftToType(const VectorPtr& input, const TypePtr& type); + + std::unique_ptr makeScanTask( + const RowTypePtr& tableSchema, + std::vector splits) { + CursorParameters params; + params.serialExecution = true; + // TODO: Mix in filter and aggregate pushdowns. + params.planNode = PlanBuilder() + .tableScan( + tableSchema, + /*subfieldFilters=*/{}, + /*remainingFilter=*/"", + tableSchema) + .planNode(); + auto cursor = TaskCursor::create(params); + for (auto& split : splits) { + cursor->task()->addSplit("0", std::move(split)); + } + cursor->task()->noMoreSplits("0"); + return cursor; + } + + const Config config_; + VectorFuzzer vectorFuzzer_; + unsigned currentSeed_; + FuzzerGenerator rng_; + int64_t sequenceNumber_ = 0; +}; + +namespace { + +std::vector> runTaskCursors( + const std::vector>& cursors, + folly::Executor& executor) { + std::vector>> futures; + for (int i = 0; i < cursors.size(); ++i) { + auto [promise, future] = + folly::makePromiseContract>(); + futures.push_back(std::move(future)); + executor.add([&, i, promise = std::move(promise)]() mutable { + std::vector results; + try { + while (cursors[i]->moveNext()) { + auto& result = cursors[i]->current(); + result->loadedVector(); + results.push_back(std::move(result)); + } + promise.setValue(std::move(results)); + } catch (const std::exception& e) { + LOG(ERROR) << e.what(); + promise.setException(e); + } + }); + } + std::vector> results; + constexpr std::chrono::seconds kTaskTimeout(10); + for (auto& future : futures) { + results.push_back(std::move(future).get(kTaskTimeout)); + } + return results; +} + +// `tableBucketCount' is the bucket count of current table setup when reading. +// `partitionBucketCount' is the bucket count when the partition was written. +// `tableBucketCount' must be a multiple of `partitionBucketCount'. +void buildScanSplitFromTableWriteResult( + const RowTypePtr& tableSchema, + const std::vector& bucketColumnIndices, + std::optional tableBucket, + int tableBucketCount, + int partitionBucketCount, + dwio::common::FileFormat fileFormat, + const std::vector& writeResult, + std::vector& splits) { + VELOX_CHECK_EQ(writeResult.size(), 1); + auto* fragments = + writeResult[0]->childAt(1)->asChecked>(); + for (int i = 1; i < writeResult[0]->size(); ++i) { + auto fragment = folly::parseJson(fragments->valueAt(i)); + auto fileName = fragment["fileWriteInfos"][0]["writeFileName"].asString(); + auto hiveSplit = std::make_shared( + TableEvolutionFuzzer::connectorId(), + fmt::format("{}/{}", fragment["writePath"].asString(), fileName), + fileFormat); + if (!tableBucket.has_value()) { + splits.emplace_back(std::move(hiveSplit)); + continue; + } + + auto fileBucketEnd = fileName.find('_'); + VELOX_CHECK_NE(fileBucketEnd, fileName.npos); + auto fileBucket = folly::to(fileName.substr(0, fileBucketEnd)); + if (*tableBucket % partitionBucketCount != fileBucket) { + continue; + } + hiveSplit->tableBucketNumber = tableBucket; + if (partitionBucketCount != tableBucketCount) { + auto& bucketConversion = hiveSplit->bucketConversion.emplace(); + bucketConversion.tableBucketCount = tableBucketCount; + bucketConversion.partitionBucketCount = partitionBucketCount; + for (auto bucketColumnIndex : bucketColumnIndices) { + auto handle = std::make_unique( + tableSchema->nameOf(bucketColumnIndex), + connector::hive::HiveColumnHandle::ColumnType::kRegular, + tableSchema->childAt(bucketColumnIndex), + tableSchema->childAt(bucketColumnIndex)); + bucketConversion.bucketColumnHandles.push_back(std::move(handle)); + } + } + splits.emplace_back(std::move(hiveSplit)); + } +} + +void checkResultsEqual( + const std::vector& actual, + const std::vector& expected) { + int actualVectorIndex = 0; + int expectedVectorIndex = 0; + int actualRowIndex = 0, expectedRowIndex = 0; + while (actualVectorIndex < actual.size() && + expectedVectorIndex < expected.size()) { + if (actualRowIndex == actual[actualVectorIndex]->size()) { + ++actualVectorIndex; + actualRowIndex = 0; + continue; + } + if (expectedRowIndex == expected[expectedVectorIndex]->size()) { + ++expectedVectorIndex; + expectedRowIndex = 0; + continue; + } + VELOX_CHECK(actual[actualVectorIndex]->equalValueAt( + expected[expectedVectorIndex].get(), actualRowIndex, expectedRowIndex)); + ++actualRowIndex; + ++expectedRowIndex; + } + if (actualVectorIndex < actual.size() && + actualRowIndex == actual[actualVectorIndex]->size()) { + ++actualVectorIndex; + actualRowIndex = 0; + } + if (expectedVectorIndex < expected.size() && + expectedRowIndex == expected[expectedVectorIndex]->size()) { + ++expectedVectorIndex; + expectedRowIndex = 0; + } + VELOX_CHECK_EQ(actualVectorIndex, actual.size()); + VELOX_CHECK_EQ(expectedVectorIndex, expected.size()); +} + +} // namespace + +std::ostream& operator<<( + std::ostream& out, + const TableEvolutionFuzzer::Setup& setup) { + out << "schema=" << setup.schema->toString() + << " log2BucketCount=" << setup.log2BucketCount + << " fileFormat=" << setup.fileFormat; + return out; +} + +VectorPtr TableEvolutionFuzzer::liftToType( + const VectorPtr& input, + const TypePtr& type) { + switch (input->typeKind()) { + case TypeKind::TINYINT: { + auto* typed = input->asChecked>(); + switch (type->kind()) { + case TypeKind::TINYINT: + return input; + case TypeKind::SMALLINT: + return liftToPrimitiveType(*typed, type); + case TypeKind::INTEGER: + return liftToPrimitiveType(*typed, type); + case TypeKind::BIGINT: + return liftToPrimitiveType(*typed, type); + default: + VELOX_UNREACHABLE(); + } + } + case TypeKind::SMALLINT: { + auto* typed = input->asChecked>(); + switch (type->kind()) { + case TypeKind::SMALLINT: + return input; + case TypeKind::INTEGER: + return liftToPrimitiveType(*typed, type); + case TypeKind::BIGINT: + return liftToPrimitiveType(*typed, type); + default: + VELOX_UNREACHABLE(); + } + } + case TypeKind::INTEGER: { + auto* typed = input->asChecked>(); + switch (type->kind()) { + case TypeKind::INTEGER: + return input; + case TypeKind::BIGINT: + return liftToPrimitiveType(*typed, type); + default: + VELOX_UNREACHABLE(); + } + } + case TypeKind::REAL: { + auto* typed = input->asChecked>(); + switch (type->kind()) { + case TypeKind::REAL: + return input; + case TypeKind::DOUBLE: + return liftToPrimitiveType(*typed, type); + default: + VELOX_UNREACHABLE(); + } + } + case TypeKind::ARRAY: { + VELOX_CHECK_EQ(type->kind(), TypeKind::ARRAY); + auto* array = input->asChecked(); + return std::make_shared( + config_.pool, + type, + array->nulls(), + array->size(), + array->offsets(), + array->sizes(), + liftToType(array->elements(), type->asArray().elementType())); + } + case TypeKind::MAP: { + VELOX_CHECK_EQ(type->kind(), TypeKind::MAP); + auto& mapType = type->asMap(); + auto* map = input->asChecked(); + return std::make_shared( + config_.pool, + type, + map->nulls(), + map->size(), + map->offsets(), + map->sizes(), + liftToType(map->mapKeys(), mapType.keyType()), + liftToType(map->mapValues(), mapType.valueType())); + } + case TypeKind::ROW: { + VELOX_CHECK_EQ(type->kind(), TypeKind::ROW); + auto& rowType = type->asRow(); + auto* row = input->asChecked(); + auto children = row->children(); + for (int i = 0; i < rowType.size(); ++i) { + auto& childType = rowType.childAt(i); + if (i < children.size()) { + children[i] = liftToType(children[i], childType); + } else { + children.push_back(BaseVector::createNullConstant( + childType, row->size(), config_.pool)); + } + } + return std::make_shared( + config_.pool, type, row->nulls(), row->size(), std::move(children)); + } + default: + return input; + } +} + +void TableEvolutionFuzzer::run() { + std::vector bucketColumnIndices; + for (int i = 0; i < config_.columnCount; ++i) { + if (folly::Random::oneIn(2 * config_.columnCount, rng_)) { + bucketColumnIndices.push_back(i); + } + } + VLOG(1) << "bucketColumnIndices: [" << folly::join(", ", bucketColumnIndices) + << "]"; + auto testSetups = makeSetups(bucketColumnIndices); + auto tableOutputRootDir = TempDirectoryPath::create(); + std::vector> writeTasks( + 2 * config_.evolutionCount - 1); + for (int i = 0; i < config_.evolutionCount; ++i) { + auto data = vectorFuzzer_.fuzzRow(testSetups[i].schema, kVectorSize, false); + for (auto& child : data->children()) { + BaseVector::flattenVector(child); + } + auto actualDir = + fmt::format("{}/actual_{}", tableOutputRootDir->getPath(), i); + VELOX_CHECK(std::filesystem::create_directory(actualDir)); + writeTasks[2 * i] = + makeWriteTask(testSetups[i], data, actualDir, bucketColumnIndices); + if (i == config_.evolutionCount - 1) { + continue; + } + auto expectedDir = + fmt::format("{}/expected_{}", tableOutputRootDir->getPath(), i); + VELOX_CHECK(std::filesystem::create_directory(expectedDir)); + auto expectedData = std::static_pointer_cast( + liftToType(data, testSetups.back().schema)); + writeTasks[2 * i + 1] = makeWriteTask( + testSetups.back(), expectedData, expectedDir, bucketColumnIndices); + } + auto executor = folly::getGlobalCPUExecutor(); + auto writeResults = runTaskCursors(writeTasks, *executor); + + std::optional selectedBucket; + if (!bucketColumnIndices.empty()) { + selectedBucket = + folly::Random::rand32(testSetups.back().bucketCount(), rng_); + VLOG(1) << "selectedBucket=" << *selectedBucket; + } + + std::vector actualSplits, expectedSplits; + for (int i = 0; i < config_.evolutionCount; ++i) { + auto* result = &writeResults[2 * i]; + buildScanSplitFromTableWriteResult( + testSetups.back().schema, + bucketColumnIndices, + selectedBucket, + testSetups.back().bucketCount(), + testSetups[i].bucketCount(), + testSetups[i].fileFormat, + *result, + actualSplits); + if (i < config_.evolutionCount - 1) { + result = &writeResults[2 * i + 1]; + } + buildScanSplitFromTableWriteResult( + testSetups.back().schema, + bucketColumnIndices, + selectedBucket, + testSetups.back().bucketCount(), + testSetups.back().bucketCount(), + testSetups.back().fileFormat, + *result, + expectedSplits); + } + std::vector> scanTasks(2); + scanTasks[0] = + makeScanTask(testSetups.back().schema, std::move(actualSplits)); + scanTasks[1] = + makeScanTask(testSetups.back().schema, std::move(expectedSplits)); + auto scanResults = runTaskCursors(scanTasks, *executor); + checkResultsEqual(scanResults[0], scanResults[1]); +} + +namespace { + +void registerFactories(folly::Executor* ioExecutor) { + filesystems::registerLocalFileSystem(); + connector::registerConnectorFactory( + std::make_shared()); + auto hiveConnector = + connector::getConnectorFactory( + connector::hive::HiveConnectorFactory::kHiveConnectorName) + ->newConnector( + TableEvolutionFuzzer::connectorId(), + std::make_shared( + std::unordered_map()), + ioExecutor); + connector::registerConnector(hiveConnector); + dwio::common::registerFileSinks(); + dwrf::registerDwrfReaderFactory(); + dwrf::registerDwrfWriterFactory(); +} + +TEST(TableEvolutionFuzzerTest, run) { + auto pool = memory::memoryManager()->addLeafPool("TableEvolutionFuzzer"); + exec::test::TableEvolutionFuzzer::Config config; + config.pool = pool.get(); + config.columnCount = FLAGS_column_count; + config.evolutionCount = FLAGS_evolution_count; + config.formats = {dwio::common::FileFormat::DWRF}; + exec::test::TableEvolutionFuzzer fuzzer(config); + fuzzer.setSeed(FLAGS_seed); + const auto startTime = std::chrono::system_clock::now(); + for (int i = 0; std::chrono::system_clock::now() - startTime < + std::chrono::seconds(FLAGS_duration_sec); + ++i) { + LOG(INFO) << "Starting iteration " << i << ", seed=" << fuzzer.seed(); + fuzzer.run(); + fuzzer.reSeed(); + } +} + +} // namespace + +} // namespace facebook::velox::exec::test + +int main(int argc, char** argv) { + testing::InitGoogleTest(&argc, argv); + folly::Init init(&argc, &argv); + if (gflags::GetCommandLineFlagInfoOrDie("seed").is_default) { + FLAGS_seed = std::random_device{}(); + LOG(INFO) << "Use generated random seed " << FLAGS_seed; + } + facebook::velox::memory::MemoryManager::initialize({}); + auto ioExecutor = folly::getGlobalIOExecutor(); + facebook::velox::exec::test::registerFactories(ioExecutor.get()); + return RUN_ALL_TESTS(); +} diff --git a/velox/expression/fuzzer/ArgumentTypeFuzzer.h b/velox/expression/fuzzer/ArgumentTypeFuzzer.h index 08c3375df9f8..b9323971bd7c 100644 --- a/velox/expression/fuzzer/ArgumentTypeFuzzer.h +++ b/velox/expression/fuzzer/ArgumentTypeFuzzer.h @@ -21,6 +21,7 @@ #include "velox/expression/FunctionSignature.h" #include "velox/expression/SignatureBinder.h" #include "velox/type/Type.h" +#include "velox/vector/fuzzer/Utils.h" namespace facebook::velox::fuzzer { @@ -33,13 +34,13 @@ class ArgumentTypeFuzzer { public: ArgumentTypeFuzzer( const exec::FunctionSignature& signature, - std::mt19937& rng) + FuzzerGenerator& rng) : ArgumentTypeFuzzer(signature, nullptr, rng) {} ArgumentTypeFuzzer( const exec::FunctionSignature& signature, const TypePtr& returnType, - std::mt19937& rng) + FuzzerGenerator& rng) : signature_{signature}, returnType_{returnType}, rng_{rng} {} /// Generate random argument types. If the desired returnType has been @@ -110,7 +111,7 @@ class ArgumentTypeFuzzer { std::unordered_map integerBindings_; /// RNG to generate random types for unbounded type variables when necessary. - std::mt19937& rng_; + FuzzerGenerator& rng_; }; /// Return the kind name of type in lower case. This is expected to match the diff --git a/velox/expression/fuzzer/tests/ArgGeneratorTestUtils.cpp b/velox/expression/fuzzer/tests/ArgGeneratorTestUtils.cpp index 7dd10d3adf33..80cf682b6012 100644 --- a/velox/expression/fuzzer/tests/ArgGeneratorTestUtils.cpp +++ b/velox/expression/fuzzer/tests/ArgGeneratorTestUtils.cpp @@ -23,7 +23,7 @@ void assertReturnType( const std::shared_ptr& generator, const exec::FunctionSignature& signature, const TypePtr& returnType) { - std::mt19937 seed{0}; + FuzzerGenerator seed{0}; const auto argTypes = generator->generateArgs(signature, returnType, seed); // Resolve return type from argument types for the given signature. @@ -42,7 +42,7 @@ void assertEmptyArgs( std::shared_ptr generator, const exec::FunctionSignature& signature, const TypePtr& returnType) { - std::mt19937 seed{0}; + FuzzerGenerator seed{0}; const auto argTypes = generator->generateArgs(signature, returnType, seed); EXPECT_TRUE(argTypes.empty()); } diff --git a/velox/expression/fuzzer/tests/ArgumentTypeFuzzerTest.cpp b/velox/expression/fuzzer/tests/ArgumentTypeFuzzerTest.cpp index ca4ee13b64d5..53883a033989 100644 --- a/velox/expression/fuzzer/tests/ArgumentTypeFuzzerTest.cpp +++ b/velox/expression/fuzzer/tests/ArgumentTypeFuzzerTest.cpp @@ -33,7 +33,7 @@ class ArgumentTypeFuzzerTest : public testing::Test { const std::shared_ptr& signature, const TypePtr& returnType, const std::vector& expectedArgumentTypes) { - std::mt19937 seed{0}; + FuzzerGenerator seed{0}; ArgumentTypeFuzzer fuzzer{*signature, returnType, seed}; ASSERT_TRUE(fuzzer.fuzzArgumentTypes(kMaxVariadicArgs)); @@ -62,7 +62,7 @@ class ArgumentTypeFuzzerTest : public testing::Test { void testFuzzingFailure( const std::shared_ptr& signature, const TypePtr& returnType) { - std::mt19937 seed{0}; + FuzzerGenerator seed{0}; ArgumentTypeFuzzer fuzzer{*signature, returnType, seed}; ASSERT_FALSE(fuzzer.fuzzArgumentTypes(kMaxVariadicArgs)); } @@ -136,7 +136,7 @@ TEST_F(ArgumentTypeFuzzerTest, signatureTemplate) { auto verifyArgumentTypes = [&](const TypePtr& returnType, const TypePtr& firstArg) { - std::mt19937 seed{0}; + FuzzerGenerator seed{0}; ArgumentTypeFuzzer fuzzer{*signature, returnType, seed}; ASSERT_TRUE(fuzzer.fuzzArgumentTypes(kMaxVariadicArgs)); @@ -161,7 +161,7 @@ TEST_F(ArgumentTypeFuzzerTest, signatureTemplate) { .build(); { - std::mt19937 seed{0}; + FuzzerGenerator seed{0}; ArgumentTypeFuzzer fuzzer{*signature, BIGINT(), seed}; ASSERT_TRUE(fuzzer.fuzzArgumentTypes(kMaxVariadicArgs)); @@ -194,7 +194,7 @@ TEST_F(ArgumentTypeFuzzerTest, variableArity) { .returnType("bigint") .variableArity("K") .build(); - std::mt19937 seed{0}; + FuzzerGenerator seed{0}; ArgumentTypeFuzzer fuzzer{*signature, BIGINT(), seed}; ASSERT_TRUE(fuzzer.fuzzArgumentTypes(kMaxVariadicArgs)); @@ -211,7 +211,7 @@ TEST_F(ArgumentTypeFuzzerTest, any) { .returnType("bigint") .argumentType("any") .build(); - std::mt19937 seed{0}; + FuzzerGenerator seed{0}; ArgumentTypeFuzzer fuzzer{*signature, BIGINT(), seed}; ASSERT_TRUE(fuzzer.fuzzArgumentTypes(kMaxVariadicArgs)); @@ -279,7 +279,7 @@ TEST_F(ArgumentTypeFuzzerTest, lambda) { .build(); { - std::mt19937 seed{0}; + FuzzerGenerator seed{0}; ArgumentTypeFuzzer fuzzer{*signature, ARRAY(VARCHAR()), seed}; ASSERT_TRUE(fuzzer.fuzzArgumentTypes(kMaxVariadicArgs)); @@ -302,7 +302,7 @@ TEST_F(ArgumentTypeFuzzerTest, lambda) { .build(); { - std::mt19937 seed{0}; + FuzzerGenerator seed{0}; ArgumentTypeFuzzer fuzzer{*signature, MAP(BIGINT(), VARCHAR()), seed}; ASSERT_TRUE(fuzzer.fuzzArgumentTypes(kMaxVariadicArgs)); @@ -325,7 +325,7 @@ TEST_F(ArgumentTypeFuzzerTest, unconstrainedSignatureTemplate) { .argumentType("K") .build(); - std::mt19937 seed{0}; + FuzzerGenerator seed{0}; ArgumentTypeFuzzer fuzzer{*signature, MAP(BIGINT(), VARCHAR()), seed}; ASSERT_TRUE(fuzzer.fuzzArgumentTypes(kMaxVariadicArgs)); @@ -336,7 +336,7 @@ TEST_F(ArgumentTypeFuzzerTest, unconstrainedSignatureTemplate) { ASSERT_EQ(argumentTypes[0]->kind(), TypeKind::MAP); - ASSERT_EQ(argumentTypes[0]->childAt(0), argumentTypes[1]); + ASSERT_EQ(*argumentTypes[0]->childAt(0), *argumentTypes[1]); } TEST_F(ArgumentTypeFuzzerTest, orderableConstraint) { @@ -348,7 +348,7 @@ TEST_F(ArgumentTypeFuzzerTest, orderableConstraint) { .build(); for (size_t i = 0; i < 100; ++i) { - std::mt19937 rng(i); + FuzzerGenerator rng(i); ArgumentTypeFuzzer fuzzer{*signature, nullptr, rng}; fuzzer.fuzzArgumentTypes(kMaxVariadicArgs); ASSERT_TRUE(fuzzer.argumentTypes()[0]->isOrderable()) @@ -419,7 +419,7 @@ TEST_F(ArgumentTypeFuzzerTest, orderableConstraint) { TEST_F(ArgumentTypeFuzzerTest, fuzzDecimalArgumentTypes) { auto fuzzArgumentTypes = [](const exec::FunctionSignature& signature, const TypePtr& returnType) { - std::mt19937 seed{0}; + FuzzerGenerator seed{0}; ArgumentTypeFuzzer fuzzer{signature, returnType, seed}; bool ok = fuzzer.fuzzArgumentTypes(kMaxVariadicArgs); VELOX_CHECK( @@ -596,7 +596,7 @@ TEST_F(ArgumentTypeFuzzerTest, fuzzDecimalArgumentTypes) { TEST_F(ArgumentTypeFuzzerTest, fuzzDecimalReturnType) { auto fuzzReturnType = [](const exec::FunctionSignature& signature) { - std::mt19937 seed{0}; + FuzzerGenerator seed{0}; ArgumentTypeFuzzer fuzzer{signature, seed}; return fuzzer.fuzzReturnType(); }; diff --git a/velox/expression/fuzzer/tests/ExpressionFuzzerUnitTest.cpp b/velox/expression/fuzzer/tests/ExpressionFuzzerUnitTest.cpp index 2a0093133687..b619cd984452 100644 --- a/velox/expression/fuzzer/tests/ExpressionFuzzerUnitTest.cpp +++ b/velox/expression/fuzzer/tests/ExpressionFuzzerUnitTest.cpp @@ -39,7 +39,7 @@ class ExpressionFuzzerUnitTest : public testing::Test { return maxLevelOfNesting; } - TypePtr randomType(std::mt19937& seed) { + TypePtr randomType(FuzzerGenerator& seed) { static std::vector kSupportedTypes{ BOOLEAN(), TINYINT(), @@ -70,7 +70,7 @@ auto makeOptionsWithMaxLevelNesting(int32_t value) { } // namespace TEST_F(ExpressionFuzzerUnitTest, restrictedLevelOfNesting) { velox::functions::prestosql::registerAllScalarFunctions(); - std::mt19937 seed{0}; + FuzzerGenerator seed{0}; auto testLevelOfNesting = [&](int32_t maxLevelOfNesting) { ExpressionFuzzer fuzzer{ @@ -111,7 +111,7 @@ TEST_F(ExpressionFuzzerUnitTest, reproduceExpressionWithSeed) { // the same. auto generateExpressions = [&]() { std::vector firstGeneration; - std::mt19937 seed{7654321}; + FuzzerGenerator seed{7654321}; ExpressionFuzzer fuzzer{ velox::getFunctionSignatures(), 1234567, @@ -135,7 +135,7 @@ TEST_F(ExpressionFuzzerUnitTest, reproduceExpressionWithSeed) { TEST_F(ExpressionFuzzerUnitTest, exprBank) { velox::functions::prestosql::registerAllScalarFunctions(); - std::mt19937 seed{0}; + FuzzerGenerator seed{0}; int32_t maxLevelOfNesting = 10; { ExpressionFuzzer fuzzer{ diff --git a/velox/functions/sparksql/tests/ArgGeneratorTest.cpp b/velox/functions/sparksql/tests/ArgGeneratorTest.cpp index e9f9b0279a59..286474bd9337 100644 --- a/velox/functions/sparksql/tests/ArgGeneratorTest.cpp +++ b/velox/functions/sparksql/tests/ArgGeneratorTest.cpp @@ -34,7 +34,7 @@ class ArgGeneratorTest : public SparkFunctionBaseTest { const exec::FunctionSignature& signature, const TypePtr& returnType, std::function&)> check) { - std::mt19937 seed{0}; + FuzzerGenerator seed{0}; const auto argTypes = generator->generateArgs(signature, returnType, seed); check(argTypes); } diff --git a/velox/vector/fuzzer/Utils.h b/velox/vector/fuzzer/Utils.h index 0248b08f942f..e0418ea3b16e 100644 --- a/velox/vector/fuzzer/Utils.h +++ b/velox/vector/fuzzer/Utils.h @@ -19,9 +19,11 @@ #include "velox/vector/BaseVector.h" #include "velox/vector/NullsBuilder.h" +#include + namespace facebook::velox { -using FuzzerGenerator = std::mt19937; +using FuzzerGenerator = folly::detail::DefaultGenerator; enum class FuzzerTimestampPrecision : int8_t { kNanoSeconds = 0, diff --git a/velox/vector/fuzzer/VectorFuzzer.cpp b/velox/vector/fuzzer/VectorFuzzer.cpp index c7fd600bf015..a56785c227a0 100644 --- a/velox/vector/fuzzer/VectorFuzzer.cpp +++ b/velox/vector/fuzzer/VectorFuzzer.cpp @@ -407,13 +407,13 @@ VectorPtr VectorFuzzer::fuzz(const TypePtr& type, vector_size_t size) { bool usingLazyVector = opts_.allowLazyVector && coinToss(0.1); // Lazy Vectors cannot be sliced, so we skip this if using lazy wrapping. - if (!usingLazyVector && coinToss(0.1)) { + if (opts_.allowSlice && !usingLazyVector && coinToss(0.1)) { // Extend the underlying vector to allow slicing later. vectorSize += rand(rng_) % 8; } // 20% chance of adding a constant vector. - if (coinToss(0.2)) { + if (opts_.allowConstantVector && coinToss(0.2)) { vector = fuzzConstant(type, vectorSize); } else if (type->isPrimitiveType()) { vector = fuzzFlatPrimitive(type, vectorSize); @@ -433,9 +433,10 @@ VectorPtr VectorFuzzer::fuzz(const TypePtr& type, vector_size_t size) { } // Toss a coin and add dictionary indirections. - while (coinToss(0.5)) { + while (opts_.allowDictionaryVector && coinToss(0.5)) { vectorSize = size; - if (!usingLazyVector && vectorSize > 0 && coinToss(0.05)) { + if (opts_.allowSlice && !usingLazyVector && vectorSize > 0 && + coinToss(0.05)) { vectorSize += rand(rng_) % 8; } vector = fuzzDictionary(vector, vectorSize); diff --git a/velox/vector/fuzzer/VectorFuzzer.h b/velox/vector/fuzzer/VectorFuzzer.h index 00a01527e234..8a3fae2c614d 100644 --- a/velox/vector/fuzzer/VectorFuzzer.h +++ b/velox/vector/fuzzer/VectorFuzzer.h @@ -149,6 +149,10 @@ class VectorFuzzer { /// dictionary layers on top of them. bool allowLazyVector{false}; + bool allowSlice{true}; + bool allowConstantVector{true}; + bool allowDictionaryVector{true}; + /// Data spec for randomly generated data. DataSpec dataSpec{false, false}; }; diff --git a/velox/vector/fuzzer/tests/VectorFuzzerTest.cpp b/velox/vector/fuzzer/tests/VectorFuzzerTest.cpp index 56526f528acd..4e2595cc9bd1 100644 --- a/velox/vector/fuzzer/tests/VectorFuzzerTest.cpp +++ b/velox/vector/fuzzer/tests/VectorFuzzerTest.cpp @@ -499,6 +499,8 @@ TEST_F(VectorFuzzerTest, containerHasNulls) { opts.nullRatio = 0.5; opts.normalizeMapKeys = false; opts.containerHasNulls = true; + opts.allowDictionaryVector = false; + opts.allowConstantVector = false; { VectorFuzzer fuzzer(opts, pool());