Skip to content

Commit

Permalink
GDV-108 : switch to a more efficient date impl (apache#91)
Browse files Browse the repository at this point in the history
- reduce benchmark iterations to 1M instead of 100M
- add checks in the benchmark tests to verify that elapsed_time is
  at most 2 * expected
  • Loading branch information
pravindra authored and praveenbingo committed Sep 10, 2018
1 parent e1d1630 commit a0e2708
Show file tree
Hide file tree
Showing 10 changed files with 7,044 additions and 229 deletions.
29 changes: 29 additions & 0 deletions cpp/src/gandiva/integ/generate_data.h
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
// limitations under the License.

#include <stdlib.h>
#include <string>

#ifndef GANDIVA_GENERATE_DATA_H
#define GANDIVA_GENERATE_DATA_H
Expand Down Expand Up @@ -56,6 +57,34 @@ class Int64DataGenerator : public DataGenerator<int64_t> {
unsigned int seed_;
};

/// \brief Cheap generator of lowercase ASCII strings for benchmarks.
///
/// Each call to GenerateData() picks a length with rand_r() (seeded with a fixed
/// value of 100) and fills the string by cycling through the letters 'a'..'z'.
/// The character cursor persists across calls, so consecutive strings continue
/// the cycle where the previous one stopped.
class FastUtf8DataGenerator : public DataGenerator<std::string> {
 public:
  /// \param max_len exclusive upper bound on the length of generated strings.
  ///        Values <= 0 are clamped to 0, in which case every generated string
  ///        is empty (instead of dividing by zero or wrapping to a huge bound).
  FastUtf8DataGenerator(int max_len)
      : seed_(100),
        max_len_(max_len > 0 ? static_cast<unsigned int>(max_len) : 0u),
        cur_char_('a') {}

  /// \return a string of length in [0, max_len) made of lowercase letters.
  std::string GenerateData() {
    std::string generated_str;

    // Guard the modulo below: a zero bound would be a division by zero.
    if (max_len_ == 0) {
      return generated_str;
    }

    const unsigned int slen = static_cast<unsigned int>(rand_r(&seed_)) % max_len_;
    generated_str.reserve(slen);
    for (unsigned int i = 0; i < slen; ++i) {
      generated_str += generate_next_char();
    }
    return generated_str;
  }

 private:
  /// Advances the cursor and returns it, wrapping from 'z' back to 'a'.
  /// The increment happens before the return, so the very first character
  /// produced by a fresh generator is 'b'.
  char generate_next_char() {
    ++cur_char_;
    if (cur_char_ > 'z') {
      cur_char_ = 'a';
    }
    return cur_char_;
  }

  unsigned int seed_;     // state for rand_r()
  unsigned int max_len_;  // exclusive upper bound on string length (0 => empty)
  char cur_char_;         // last character handed out
};

} // namespace gandiva

#endif // GANDIVA_GENERATE_DATA_H
106 changes: 102 additions & 4 deletions cpp/src/gandiva/integ/micro_benchmarks.cc
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,9 @@ namespace gandiva {
using arrow::boolean;
using arrow::int32;
using arrow::int64;
using arrow::utf8;

// Multiplier applied to each benchmark's expected elapsed time (in ms) to form
// the upper bound that the tests below check with EXPECT_LE.
float tolerance_ratio = 2.0;

class TestBenchmarks : public ::testing::Test {
public:
Expand Down Expand Up @@ -59,11 +62,14 @@ TEST_F(TestBenchmarks, TimedTestAdd3) {

int64_t elapsed_millis;
Int64DataGenerator data_generator;
status = TimedEvaluate<arrow::Int64Type, int64_t>(schema, projector, data_generator,
pool_, 100 * MILLION, 16 * THOUSAND,
ProjectEvaluator evaluator(projector);

status = TimedEvaluate<arrow::Int64Type, int64_t>(schema, evaluator, data_generator,
pool_, 1 * MILLION, 16 * THOUSAND,
elapsed_millis);
ASSERT_TRUE(status.ok());
std::cout << "Time taken for Add3 " << elapsed_millis << " ms\n";
EXPECT_LE(elapsed_millis, 2 * tolerance_ratio);
}

TEST_F(TestBenchmarks, TimedTestBigNested) {
Expand Down Expand Up @@ -103,11 +109,103 @@ TEST_F(TestBenchmarks, TimedTestBigNested) {

int64_t elapsed_millis;
BoundedInt32DataGenerator data_generator(250);
status = TimedEvaluate<arrow::Int32Type, int32_t>(schema, projector, data_generator,
pool_, 100 * MILLION, 16 * THOUSAND,
ProjectEvaluator evaluator(projector);

status = TimedEvaluate<arrow::Int32Type, int32_t>(schema, evaluator, data_generator,
pool_, 1 * MILLION, 16 * THOUSAND,
elapsed_millis);
ASSERT_TRUE(status.ok());
std::cout << "Time taken for BigNestedIf " << elapsed_millis << " ms\n";

EXPECT_LE(elapsed_millis, 12 * tolerance_ratio);
}

// Benchmarks the "extractYear" projection over a single date64 column and checks
// that the elapsed time stays within the tolerated bound.
TEST_F(TestBenchmarks, TimedTestExtractYear) {
  // schema for input fields
  auto field0 = field("f0", arrow::date64());
  auto schema = arrow::schema({field0});

  // output field
  auto field_res = field("res", int64());

  // Build expression
  auto expr = TreeExprBuilder::MakeExpression("extractYear", {field0}, field_res);

  std::shared_ptr<Projector> projector;
  Status status = Projector::Make(schema, {expr}, &projector);
  // Fatal assertion: the projector is used below, so the test must stop here
  // rather than continue with an unset projector if construction failed.
  ASSERT_TRUE(status.ok());

  int64_t elapsed_millis;
  Int64DataGenerator data_generator;
  ProjectEvaluator evaluator(projector);

  status = TimedEvaluate<arrow::Date64Type, int64_t>(schema, evaluator, data_generator,
                                                     pool_, 1 * MILLION, 16 * THOUSAND,
                                                     elapsed_millis);
  ASSERT_TRUE(status.ok());
  std::cout << "Time taken for extractYear " << elapsed_millis << " ms\n";

  EXPECT_LE(elapsed_millis, 11 * tolerance_ratio);
}

// Benchmarks a filter whose condition is (f1 + f2) < f2 over int64 columns and
// checks that the elapsed time stays within the tolerated bound.
TEST_F(TestBenchmarks, TimedTestFilterAdd2) {
  // schema for input fields
  auto field0 = field("f0", int64());
  auto field1 = field("f1", int64());
  auto field2 = field("f2", int64());
  auto schema = arrow::schema({field0, field1, field2});

  // Build expression
  auto sum = TreeExprBuilder::MakeFunction(
      "add", {TreeExprBuilder::MakeField(field1), TreeExprBuilder::MakeField(field2)},
      int64());
  auto less_than = TreeExprBuilder::MakeFunction(
      "less_than", {sum, TreeExprBuilder::MakeField(field2)}, boolean());
  auto condition = TreeExprBuilder::MakeCondition(less_than);

  std::shared_ptr<Filter> filter;
  Status status = Filter::Make(schema, condition, &filter);
  // Fatal assertion: the filter is used below, so the test must stop here
  // rather than continue with an unset filter if construction failed.
  ASSERT_TRUE(status.ok());

  int64_t elapsed_millis;
  Int64DataGenerator data_generator;
  FilterEvaluator evaluator(filter);

  status = TimedEvaluate<arrow::Int64Type, int64_t>(
      schema, evaluator, data_generator, pool_, MILLION, 16 * THOUSAND, elapsed_millis);
  ASSERT_TRUE(status.ok());
  std::cout << "Time taken for Filter with Add2 " << elapsed_millis << " ms\n";

  EXPECT_LE(elapsed_millis, 2 * tolerance_ratio);
}

// Benchmarks a filter with the condition `a LIKE '%yellow%'` over a utf8 column
// and checks that the elapsed time stays within the tolerated bound.
TEST_F(TestBenchmarks, TimedTestFilterLike) {
  // schema for input fields
  auto fielda = field("a", utf8());
  auto schema = arrow::schema({fielda});

  // build expression.
  auto node_a = TreeExprBuilder::MakeField(fielda);
  auto pattern_node = TreeExprBuilder::MakeStringLiteral("%yellow%");
  auto like_yellow =
      TreeExprBuilder::MakeFunction("like", {node_a, pattern_node}, arrow::boolean());
  auto condition = TreeExprBuilder::MakeCondition(like_yellow);

  std::shared_ptr<Filter> filter;
  Status status = Filter::Make(schema, condition, &filter);
  // Fatal assertion: the filter is used below, so the test must stop here
  // rather than continue with an unset filter if construction failed.
  ASSERT_TRUE(status.ok());

  int64_t elapsed_millis;
  FastUtf8DataGenerator data_generator(32);
  FilterEvaluator evaluator(filter);

  status = TimedEvaluate<arrow::StringType, std::string>(
      schema, evaluator, data_generator, pool_, 1 * MILLION, 16 * THOUSAND,
      elapsed_millis);
  ASSERT_TRUE(status.ok());
  std::cout << "Time taken for Filter with like " << elapsed_millis << " ms\n";

  EXPECT_LE(elapsed_millis, 20000 * tolerance_ratio);
}

} // namespace gandiva
43 changes: 40 additions & 3 deletions cpp/src/gandiva/integ/timed_evaluate.h
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
#include <memory>
#include <vector>
#include "gandiva/arrow.h"
#include "gandiva/filter.h"
#include "gandiva/projector.h"
#include "integ/generate_data.h"

Expand All @@ -37,8 +38,45 @@ std::vector<C_TYPE> GenerateData(int num_records, DataGenerator<C_TYPE> &data_ge
return data;
}

/// \brief Interface for an operation that can be run against a record batch.
///
/// Implementations wrap a concrete evaluator so that it can be driven through
/// a single Evaluate() entry point.
class BaseEvaluator {
 public:
  // Polymorphic base: a virtual destructor makes destruction through a
  // BaseEvaluator pointer well-defined.
  virtual ~BaseEvaluator() = default;

  /// Evaluates against \p batch, using \p pool for any allocations.
  /// \return the status reported by the underlying evaluator.
  virtual Status Evaluate(arrow::RecordBatch &batch, arrow::MemoryPool *pool) = 0;
};

/// \brief BaseEvaluator adapter that runs a Projector over each batch.
class ProjectEvaluator : public BaseEvaluator {
 public:
  ProjectEvaluator(std::shared_ptr<Projector> projector) : projector_(projector) {}

  /// Runs the projector on \p batch; the produced arrays are collected into a
  /// local vector that is discarded when this call returns.
  Status Evaluate(arrow::RecordBatch &batch, arrow::MemoryPool *pool) override {
    arrow::ArrayVector projected;
    auto eval_status = projector_->Evaluate(batch, pool, &projected);
    return eval_status;
  }

 private:
  std::shared_ptr<Projector> projector_;
};

/// \brief BaseEvaluator adapter that runs a Filter over each batch.
///
/// The selection vector is kept between calls and only re-created when it is
/// missing or has fewer slots than the incoming batch has rows.
class FilterEvaluator : public BaseEvaluator {
 public:
  FilterEvaluator(std::shared_ptr<Filter> filter) : filter_(filter) {}

  /// Ensures a large-enough selection vector exists, then applies the filter.
  /// \return the allocation error if creating the selection vector fails,
  ///         otherwise the status of the filter evaluation.
  Status Evaluate(arrow::RecordBatch &batch, arrow::MemoryPool *pool) override {
    // Short-circuit keeps the slot check from touching a null selection_.
    const bool needs_new_selection =
        (selection_ == nullptr) || (selection_->GetMaxSlots() < batch.num_rows());

    if (needs_new_selection) {
      auto make_status = SelectionVector::MakeInt16(batch.num_rows(), pool, &selection_);
      if (!make_status.ok()) {
        return make_status;
      }
    }

    return filter_->Evaluate(batch, selection_);
  }

 private:
  std::shared_ptr<Filter> filter_;
  std::shared_ptr<SelectionVector> selection_;
};

template <typename TYPE, typename C_TYPE>
Status TimedEvaluate(SchemaPtr schema, std::shared_ptr<Projector> projector,
Status TimedEvaluate(SchemaPtr schema, BaseEvaluator &evaluator,
DataGenerator<C_TYPE> &data_generator, arrow::MemoryPool *pool,
int num_records, int batch_size, int64_t &num_millis) {
int num_remaining = num_records;
Expand Down Expand Up @@ -69,9 +107,8 @@ Status TimedEvaluate(SchemaPtr schema, std::shared_ptr<Projector> projector,
auto in_batch = arrow::RecordBatch::Make(schema, num_in_batch, columns);

// evaluate
arrow::ArrayVector outputs;
start = std::chrono::high_resolution_clock::now();
status = projector->Evaluate(*in_batch, pool, &outputs);
status = evaluator.Evaluate(*in_batch, pool);
finish = std::chrono::high_resolution_clock::now();
if (!status.ok()) {
return status;
Expand Down
1 change: 1 addition & 0 deletions cpp/src/gandiva/precompiled/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,7 @@ add_custom_target(precompiled ALL DEPENDS ${GANDIVA_BC_OUTPUT_PATH})

# testing
add_precompiled_unit_test(bitmap_test.cc bitmap.cc)
add_precompiled_unit_test(epoch_time_point_test.cc)
add_precompiled_unit_test(time_test.cc time.cc timestamp_arithmetic.cc)
add_precompiled_unit_test(hash_test.cc hash.cc)
add_precompiled_unit_test(sample_test.cc sample.cc)
Expand Down
Loading

0 comments on commit a0e2708

Please sign in to comment.