Skip to content

Commit

Permalink
Add method to parse date from null-terminated string
Browse files Browse the repository at this point in the history
  • Loading branch information
projjal committed Jun 11, 2020
1 parent a262cf1 commit e722790
Show file tree
Hide file tree
Showing 4 changed files with 72 additions and 2 deletions.
33 changes: 33 additions & 0 deletions cpp/src/arrow/util/value_parsing.h
Original file line number Diff line number Diff line change
Expand Up @@ -561,6 +561,39 @@ static inline bool ParseTimestampStrptime(const char* buf, size_t length,
return true;
}

/// \brief Returns time since the UNIX epoch in the requested unit. Takes null terminated
/// buffer as argument
static inline bool ParseTimestampStrptimeFromCString(const char* buf, size_t length,
const char* format,
bool ignore_time_in_day,
bool allow_trailing_chars,
TimeUnit::type unit, int64_t* out) {
// NOTE: strptime() is more than 10x faster than arrow_vendored::date::parse().
struct tm result;
memset(&result, 0, sizeof(struct tm));
#ifdef _WIN32
char* ret = arrow_strptime(buf, format, &result);
#else
char* ret = strptime(buf, format, &result);
#endif
if (ret == NULLPTR) {
return false;
}
if (!allow_trailing_chars && static_cast<size_t>(ret - buf) != length) {
return false;
}
// ignore the time part
arrow_vendored::date::sys_seconds secs =
arrow_vendored::date::sys_days(arrow_vendored::date::year(result.tm_year + 1900) /
(result.tm_mon + 1) / result.tm_mday);
if (!ignore_time_in_day) {
secs += (std::chrono::hours(result.tm_hour) + std::chrono::minutes(result.tm_min) +
std::chrono::seconds(result.tm_sec));
}
*out = detail::ConvertTimePoint(secs, unit);
return true;
}

/// \brief Parsing options for timestamps
struct ParseTimestampContext {
TimeUnit::type unit;
Expand Down
12 changes: 12 additions & 0 deletions cpp/src/gandiva/tests/generate_data.h
Original file line number Diff line number Diff line change
Expand Up @@ -129,4 +129,16 @@ class FastUtf8DataGenerator : public DataGenerator<std::string> {
char cur_char_;
};

/// \brief Data generator producing date-like strings of the form
/// "1990-0<m>-1<d>", where <m> and <d> are derived from two random draws.
class Utf8DateDataGenerator : public DataGenerator<std::string> {
 public:
  Utf8DateDataGenerator() {}

  /// \brief Build the next date-like string.
  ///
  /// NOTE(review): the result is a well-formed YYYY-MM-DD date only when
  /// `random_.next() / 9 + 1` is a single digit; confirm against the range of
  /// Random::next() (a modulo may have been intended).
  std::string GenerateData() {
    // Draw the two values in separate statements: inside a single operator+ chain
    // the relative order of the two next() calls is unspecified, which would make
    // the generated sequence depend on the compiler.
    const auto month_part = random_.next() / 9 + 1;
    const auto day_part = random_.next() / 9 + 1;
    return "1990-0" + std::to_string(month_part) + "-1" + std::to_string(day_part);
  }

 private:
  Random random_;
};
} // namespace gandiva
27 changes: 26 additions & 1 deletion cpp/src/gandiva/tests/micro_benchmarks.cc
Original file line number Diff line number Diff line change
Expand Up @@ -277,6 +277,31 @@ static void TimedTestInExpr(benchmark::State& state) {
ASSERT_OK(status);
}

// Benchmark for the "to_date" function: projects a utf8 column of generated
// date strings to a date64 column using the pattern "YYYY-MM-DD".
static void TimedTestToDate(benchmark::State& state) {
  // schema for input fields
  auto field_a = field("a", utf8());
  auto schema = arrow::schema({field_a});
  auto pool = arrow::default_memory_pool();

  // output field
  auto field_result = field("res", arrow::date64());

  // build expression: to_date(a, "YYYY-MM-DD", 1)
  auto node_a = TreeExprBuilder::MakeField(field_a);
  auto date_pattern = TreeExprBuilder::MakeStringLiteral("YYYY-MM-DD");
  // NOTE(review): presumably the "suppress errors" flag of to_date - confirm.
  auto suppress_literal = TreeExprBuilder::MakeLiteral(1);
  auto fn = TreeExprBuilder::MakeFunction(
      "to_date", {node_a, date_pattern, suppress_literal}, arrow::date64());
  auto expr = TreeExprBuilder::MakeExpression(fn, field_result);

  std::shared_ptr<Projector> projector;
  ASSERT_OK(Projector::Make(schema, {expr}, TestConfiguration(), &projector));

  Utf8DateDataGenerator data_generator;
  ProjectEvaluator evaluator(projector);

  Status status = TimedEvaluate<arrow::StringType, std::string>(
      schema, evaluator, data_generator, pool, 1 * MILLION, 16 * THOUSAND, state);
  // Use ASSERT_OK, like the Projector::Make check above and the other benchmarks
  // in this file, rather than asserting on status.ok().
  ASSERT_OK(status);
}

static void DoDecimalAdd3(benchmark::State& state, int32_t precision, int32_t scale,
bool large = false) {
// schema for input fields
Expand Down Expand Up @@ -398,6 +423,7 @@ BENCHMARK(TimedTestFilterLike)->MinTime(1.0)->Unit(benchmark::kMicrosecond);
BENCHMARK(TimedTestAllocs)->MinTime(1.0)->Unit(benchmark::kMicrosecond);
BENCHMARK(TimedTestMultiOr)->MinTime(1.0)->Unit(benchmark::kMicrosecond);
BENCHMARK(TimedTestInExpr)->MinTime(1.0)->Unit(benchmark::kMicrosecond);
BENCHMARK(TimedTestToDate)->MinTime(1.0)->Unit(benchmark::kMicrosecond);
BENCHMARK(DecimalAdd2Fast)->MinTime(1.0)->Unit(benchmark::kMicrosecond);
BENCHMARK(DecimalAdd2LeadingZeroes)->MinTime(1.0)->Unit(benchmark::kMicrosecond);
BENCHMARK(DecimalAdd2LeadingZeroesWithDiv)->MinTime(1.0)->Unit(benchmark::kMicrosecond);
Expand All @@ -406,5 +432,4 @@ BENCHMARK(DecimalAdd3Fast)->MinTime(1.0)->Unit(benchmark::kMicrosecond);
BENCHMARK(DecimalAdd3LeadingZeroes)->MinTime(1.0)->Unit(benchmark::kMicrosecond);
BENCHMARK(DecimalAdd3LeadingZeroesWithDiv)->MinTime(1.0)->Unit(benchmark::kMicrosecond);
BENCHMARK(DecimalAdd3Large)->MinTime(1.0)->Unit(benchmark::kMicrosecond);

} // namespace gandiva
2 changes: 1 addition & 1 deletion cpp/src/gandiva/to_date_holder.cc
Original file line number Diff line number Diff line change
Expand Up @@ -83,7 +83,7 @@ int64_t ToDateHolder::operator()(ExecutionContext* context, const std::string& d
// 1. processes date that do not match the format.
// 2. does not process time in format +08:00 (or) id.
int64_t seconds_since_epoch = 0;
if (!::arrow::internal::ParseTimestampStrptime(
if (!::arrow::internal::ParseTimestampStrptimeFromCString(
data.c_str(), data.length(), pattern_.c_str(),
/*ignore_time_in_day=*/true, /*allow_trailing_chars=*/true,
::arrow::TimeUnit::SECOND, &seconds_since_epoch)) {
Expand Down

0 comments on commit e722790

Please sign in to comment.