diff --git a/cpp/src/arrow/util/value_parsing.h b/cpp/src/arrow/util/value_parsing.h index f0f5214d856b5..6c46433c57051 100644 --- a/cpp/src/arrow/util/value_parsing.h +++ b/cpp/src/arrow/util/value_parsing.h @@ -561,6 +561,39 @@ static inline bool ParseTimestampStrptime(const char* buf, size_t length, return true; } +/// \brief Returns time since the UNIX epoch in the requested unit. Takes null terminated +/// buffer as argument +static inline bool ParseTimestampStrptimeFromCString(const char* buf, size_t length, + const char* format, + bool ignore_time_in_day, + bool allow_trailing_chars, + TimeUnit::type unit, int64_t* out) { + // NOTE: strptime() is more than 10x faster than arrow_vendored::date::parse(). + struct tm result; + memset(&result, 0, sizeof(struct tm)); +#ifdef _WIN32 + char* ret = arrow_strptime(buf, format, &result); +#else + char* ret = strptime(buf, format, &result); +#endif + if (ret == NULLPTR) { + return false; + } + if (!allow_trailing_chars && static_cast(ret - buf) != length) { + return false; + } + // ignore the time part + arrow_vendored::date::sys_seconds secs = + arrow_vendored::date::sys_days(arrow_vendored::date::year(result.tm_year + 1900) / + (result.tm_mon + 1) / result.tm_mday); + if (!ignore_time_in_day) { + secs += (std::chrono::hours(result.tm_hour) + std::chrono::minutes(result.tm_min) + + std::chrono::seconds(result.tm_sec)); + } + *out = detail::ConvertTimePoint(secs, unit); + return true; +} + /// \brief Parsing options for timestamps struct ParseTimestampContext { TimeUnit::type unit; diff --git a/cpp/src/gandiva/tests/generate_data.h b/cpp/src/gandiva/tests/generate_data.h index bd7bbb8e7a368..32a5bbe3c591b 100644 --- a/cpp/src/gandiva/tests/generate_data.h +++ b/cpp/src/gandiva/tests/generate_data.h @@ -129,4 +129,16 @@ class FastUtf8DataGenerator : public DataGenerator { char cur_char_; }; +class Utf8DateDataGenerator : public DataGenerator { + public: + Utf8DateDataGenerator() {} + + std::string GenerateData() { + return "1990-0" + std::to_string(random_.next() / 9 + 1) + "-1" + + std::to_string(random_.next() / 9 + 1); + } + + private: + Random random_; +}; } // namespace gandiva diff --git a/cpp/src/gandiva/tests/micro_benchmarks.cc b/cpp/src/gandiva/tests/micro_benchmarks.cc index 6b4dddf994b15..aec9f8a62ccda 100644 --- a/cpp/src/gandiva/tests/micro_benchmarks.cc +++ b/cpp/src/gandiva/tests/micro_benchmarks.cc @@ -277,6 +277,31 @@ static void TimedTestInExpr(benchmark::State& state) { ASSERT_OK(status); } +static void TimedTestToDate(benchmark::State& state) { + auto field_a = field("a", utf8()); + auto schema = arrow::schema({field_a}); + auto pool = arrow::default_memory_pool(); + + auto field_result = field("res", arrow::date64()); + + auto node_a = TreeExprBuilder::MakeField(field_a); + auto date_pattern = TreeExprBuilder::MakeStringLiteral("YYYY-MM-DD"); + auto suppress_literal = TreeExprBuilder::MakeLiteral(1); + auto fn = TreeExprBuilder::MakeFunction( + "to_date", {node_a, date_pattern, suppress_literal}, arrow::date64()); + auto expr = TreeExprBuilder::MakeExpression(fn, field_result); + + std::shared_ptr projector; + ASSERT_OK(Projector::Make(schema, {expr}, TestConfiguration(), &projector)); + + Utf8DateDataGenerator data_generator; + ProjectEvaluator evaluator(projector); + + Status status = TimedEvaluate( + schema, evaluator, data_generator, pool, 1 * MILLION, 16 * THOUSAND, state); + ASSERT_TRUE(status.ok()); +} + static void DoDecimalAdd3(benchmark::State& state, int32_t precision, int32_t scale, bool large = false) { // schema for input fields @@ -398,6 +423,7 @@ BENCHMARK(TimedTestFilterLike)->MinTime(1.0)->Unit(benchmark::kMicrosecond); BENCHMARK(TimedTestAllocs)->MinTime(1.0)->Unit(benchmark::kMicrosecond); BENCHMARK(TimedTestMultiOr)->MinTime(1.0)->Unit(benchmark::kMicrosecond); BENCHMARK(TimedTestInExpr)->MinTime(1.0)->Unit(benchmark::kMicrosecond); +BENCHMARK(TimedTestToDate)->MinTime(1.0)->Unit(benchmark::kMicrosecond); BENCHMARK(DecimalAdd2Fast)->MinTime(1.0)->Unit(benchmark::kMicrosecond); BENCHMARK(DecimalAdd2LeadingZeroes)->MinTime(1.0)->Unit(benchmark::kMicrosecond); BENCHMARK(DecimalAdd2LeadingZeroesWithDiv)->MinTime(1.0)->Unit(benchmark::kMicrosecond); @@ -406,5 +432,4 @@ BENCHMARK(DecimalAdd3Fast)->MinTime(1.0)->Unit(benchmark::kMicrosecond); BENCHMARK(DecimalAdd3LeadingZeroes)->MinTime(1.0)->Unit(benchmark::kMicrosecond); BENCHMARK(DecimalAdd3LeadingZeroesWithDiv)->MinTime(1.0)->Unit(benchmark::kMicrosecond); BENCHMARK(DecimalAdd3Large)->MinTime(1.0)->Unit(benchmark::kMicrosecond); - } // namespace gandiva diff --git a/cpp/src/gandiva/to_date_holder.cc b/cpp/src/gandiva/to_date_holder.cc index fe5e470ca7b0b..76efa3a7b8e81 100644 --- a/cpp/src/gandiva/to_date_holder.cc +++ b/cpp/src/gandiva/to_date_holder.cc @@ -83,7 +83,7 @@ int64_t ToDateHolder::operator()(ExecutionContext* context, const std::string& d // 1. processes date that do not match the format. // 2. does not process time in format +08:00 (or) id. int64_t seconds_since_epoch = 0; - if (!::arrow::internal::ParseTimestampStrptime( + if (!::arrow::internal::ParseTimestampStrptimeFromCString( data.c_str(), data.length(), pattern_.c_str(), /*ignore_time_in_day=*/true, /*allow_trailing_chars=*/true, ::arrow::TimeUnit::SECOND, &seconds_since_epoch)) {