Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 11 additions & 4 deletions cpp/src/arrow/compute/api_scalar.cc
Original file line number Diff line number Diff line change
Expand Up @@ -353,7 +353,8 @@ static auto kStrftimeOptionsType = GetFunctionOptionsType<StrftimeOptions>(
DataMember("format", &StrftimeOptions::format));
static auto kStrptimeOptionsType = GetFunctionOptionsType<StrptimeOptions>(
DataMember("format", &StrptimeOptions::format),
DataMember("unit", &StrptimeOptions::unit));
DataMember("unit", &StrptimeOptions::unit),
DataMember("error_is_null", &StrptimeOptions::error_is_null));
static auto kStructFieldOptionsType = GetFunctionOptionsType<StructFieldOptions>(
DataMember("indices", &StructFieldOptions::indices));
static auto kTrimOptionsType = GetFunctionOptionsType<TrimOptions>(
Expand Down Expand Up @@ -544,11 +545,13 @@ StrftimeOptions::StrftimeOptions() : StrftimeOptions(kDefaultFormat) {}
constexpr char StrftimeOptions::kTypeName[];
constexpr const char* StrftimeOptions::kDefaultFormat;

// StrptimeOptions constructors: the parameterized form stores the format string
// and time unit (and, in the three-argument form, the error_is_null flag); the
// default constructor delegates to it with an empty format string.
StrptimeOptions::StrptimeOptions(std::string format, TimeUnit::type unit)
StrptimeOptions::StrptimeOptions(std::string format, TimeUnit::type unit,
bool error_is_null)
: FunctionOptions(internal::kStrptimeOptionsType),
format(std::move(format)),
unit(unit) {}
StrptimeOptions::StrptimeOptions() : StrptimeOptions("", TimeUnit::SECOND) {}
unit(unit),
error_is_null(error_is_null) {}
StrptimeOptions::StrptimeOptions() : StrptimeOptions("", TimeUnit::MICRO, false) {}
constexpr char StrptimeOptions::kTypeName[];

StructFieldOptions::StructFieldOptions(std::vector<int> indices)
Expand Down Expand Up @@ -822,6 +825,10 @@ Result<Datum> Strftime(const Datum& arg, StrftimeOptions options, ExecContext* c
return CallFunction("strftime", {arg}, &options, ctx);
}

// Dispatches to the compute function named "strptime", passing `arg` as its
// single argument together with the supplied options and execution context.
Result<Datum> Strptime(const Datum& arg, StrptimeOptions options, ExecContext* ctx) {
  const std::vector<Datum> call_args{arg};
  return CallFunction("strptime", call_args, &options, ctx);
}

// Dispatches to the compute function named "week", passing `arg` as its single
// argument together with the supplied options and execution context.
Result<Datum> Week(const Datum& arg, WeekOptions options, ExecContext* ctx) {
  const std::vector<Datum> call_args{arg};
  return CallFunction("week", call_args, &options, ctx);
}
Expand Down
23 changes: 22 additions & 1 deletion cpp/src/arrow/compute/api_scalar.h
Original file line number Diff line number Diff line change
Expand Up @@ -267,12 +267,17 @@ class ARROW_EXPORT StructFieldOptions : public FunctionOptions {

/// \brief Options for the strptime function: the format string to parse with,
/// the time resolution of the result, and how parse errors are reported.
class ARROW_EXPORT StrptimeOptions : public FunctionOptions {
public:
explicit StrptimeOptions(std::string format, TimeUnit::type unit);
explicit StrptimeOptions(std::string format, TimeUnit::type unit,
bool error_is_null = false);
StrptimeOptions();
static constexpr char const kTypeName[] = "StrptimeOptions";

/// The desired format string.
std::string format;
/// The desired time resolution.
TimeUnit::type unit;
/// If true, parsing errors produce null; if false, they raise an error.
bool error_is_null;
};

class ARROW_EXPORT StrftimeOptions : public FunctionOptions {
Expand Down Expand Up @@ -1398,6 +1403,22 @@ ARROW_EXPORT Result<Datum> Subsecond(const Datum& values, ExecContext* ctx = NUL
ARROW_EXPORT Result<Datum> Strftime(const Datum& values, StrftimeOptions options,
ExecContext* ctx = NULLPTR);

/// \brief Parse timestamps according to a format string
///
/// Return parsed timestamps according to the format string
/// `StrptimeOptions::format` at time resolution `StrptimeOptions::unit`. Parse
/// errors either raise or produce null, depending on the
/// `StrptimeOptions::error_is_null` setting.
///
/// \param[in] values input strings
/// \param[in] options for setting format string, unit and error_is_null
/// \param[in] ctx the function execution context, optional
/// \return the resulting datum
///
/// \since 8.0.0
/// \note API not yet finalized
ARROW_EXPORT Result<Datum> Strptime(const Datum& values, StrptimeOptions options,
ExecContext* ctx = NULLPTR);

/// \brief Converts timestamps from local timestamp without a timezone to a timestamp with
/// timezone, interpreting the local timestamp as being in the specified timezone for each
/// element of `values`
Expand Down
2 changes: 1 addition & 1 deletion cpp/src/arrow/compute/exec/expression_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -737,7 +737,7 @@ TEST(Expression, ExecuteCall) {
])"));

ExpectExecute(call("strptime", {field_ref("a")},
compute::StrptimeOptions("%m/%d/%Y", TimeUnit::MICRO)),
compute::StrptimeOptions("%m/%d/%Y", TimeUnit::MICRO, true)),
ArrayFromJSON(struct_({field("a", utf8())}), R"([
{"a": "5/1/2020"},
{"a": null},
Expand Down
2 changes: 1 addition & 1 deletion cpp/src/arrow/compute/function_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -88,7 +88,7 @@ TEST(FunctionOptions, Equality) {
options.emplace_back(new ExtractRegexOptions("pattern2"));
options.emplace_back(new SetLookupOptions(ArrayFromJSON(int64(), "[1, 2, 3, 4]")));
options.emplace_back(new SetLookupOptions(ArrayFromJSON(boolean(), "[true, false]")));
options.emplace_back(new StrptimeOptions("%Y", TimeUnit::type::MILLI));
options.emplace_back(new StrptimeOptions("%Y", TimeUnit::type::MILLI, true));
options.emplace_back(new StrptimeOptions("%Y", TimeUnit::type::NANO));
options.emplace_back(new StrftimeOptions("%Y-%m-%dT%H:%M:%SZ", "C"));
#ifndef _WIN32
Expand Down
74 changes: 0 additions & 74 deletions cpp/src/arrow/compute/kernels/scalar_string_ascii.cc
Original file line number Diff line number Diff line change
Expand Up @@ -2770,79 +2770,6 @@ void AddAsciiStringSplitRegex(FunctionRegistry* registry) {
}
#endif // ARROW_WITH_RE2

// ----------------------------------------------------------------------
// strptime string parsing

// Kernel state type through which the StrptimeOptions are retrieved.
using StrptimeState = OptionsWrapper<StrptimeOptions>;

// Functor that parses a single string into a timestamp value, using a parser
// built from the format in StrptimeOptions and the unit requested there.
struct ParseStrptime {
  explicit ParseStrptime(const StrptimeOptions& options)
      : parser(TimestampParser::MakeStrptime(options.format)), unit(options.unit) {}

  // Parses `val` at resolution `unit`. When parsing fails, an Invalid status is
  // written to `*st`; the value held in the result slot is returned either way.
  template <typename... Ignored>
  int64_t Call(KernelContext*, util::string_view val, Status* st) const {
    int64_t parsed = 0;
    const bool ok = (*parser)(val.data(), val.size(), unit, &parsed);
    if (ok) {
      return parsed;
    }
    *st = Status::Invalid("Failed to parse string: '", val, "' as a scalar of type ",
                          TimestampType(unit).ToString());
    return parsed;
  }

  std::shared_ptr<TimestampParser> parser;
  TimeUnit::type unit;
};

// Kernel entry point: builds a ParseStrptime functor from the options held in
// the kernel state and applies it to the batch through
// ScalarUnaryNotNullStateful.
template <typename InputType>
struct StrptimeExec {
  static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
    using Applicator =
        applicator::ScalarUnaryNotNullStateful<TimestampType, InputType, ParseStrptime>;
    Applicator kernel{ParseStrptime(StrptimeState::Get(ctx))};
    return kernel.Exec(ctx, batch, out);
  }
};

// Resolve the output type of the "strptime" kernel from its StrptimeOptions.
//
// Returns an Invalid status when the kernel context carries no state (and hence
// no options). Otherwise returns a timestamp type with the unit requested in
// the options; its timezone is "UTC" when the format string contains the `%z`
// specifier and empty otherwise.
Result<ValueDescr> ResolveStrptimeOutput(KernelContext* ctx,
                                         const std::vector<ValueDescr>&) {
  if (!ctx->state()) {
    return Status::Invalid("strptime does not provide default StrptimeOptions");
  }
  const StrptimeOptions& options = StrptimeState::Get(ctx);
  const std::string& format = options.format;

  // Scan for a `%z` specifier. Only the lowercase `%z` is detected here; `%Z`
  // is not checked.
  //
  // The bound is written as `cur + 1 < size()` instead of `cur < size() - 1`:
  // with an empty format the latter wraps around (size() is unsigned) and the
  // loop would index past the end of the string.
  std::string zone;
  for (size_t cur = 0; cur + 1 < format.size(); ++cur) {
    if (format[cur] != '%') {
      continue;
    }
    if (format[cur + 1] == 'z') {
      zone = "UTC";
      break;
    }
    // Skip the character following '%' so that the second '%' of an escaped
    // "%%" is not mistaken for the start of a new specifier.
    ++cur;
  }
  return ::arrow::timestamp(options.unit, zone);
}

// User-facing documentation for the "strptime" function. It declares a single
// argument named "strings" and marks StrptimeOptions as required.
const FunctionDoc strptime_doc(
"Parse timestamps",
("For each string in `strings`, parse it as a timestamp.\n"
"The timestamp unit and the expected string pattern must be given\n"
"in StrptimeOptions. Null inputs emit null. If a non-null string\n"
"fails parsing, an error is returned."),
{"strings"}, "StrptimeOptions", /*options_required=*/true);

// Builds the unary "strptime" scalar function, adds one kernel per type returned
// by StringTypes() (output type resolved by ResolveStrptimeOutput), and adds
// the function to `registry`.
void AddAsciiStringStrptime(FunctionRegistry* registry) {
  OutputType resolved_output(ResolveStrptimeOutput);
  auto strptime_func =
      std::make_shared<ScalarFunction>("strptime", Arity::Unary(), &strptime_doc);

  for (const auto& string_type : StringTypes()) {
    auto kernel_exec = GenerateVarBinaryToVarBinary<StrptimeExec>(string_type);
    DCHECK_OK(strptime_func->AddKernel({string_type}, resolved_output,
                                       std::move(kernel_exec), StrptimeState::Init));
  }
  DCHECK_OK(registry->AddFunction(std::move(strptime_func)));
}

// ----------------------------------------------------------------------
// Binary join

Expand Down Expand Up @@ -3518,7 +3445,6 @@ void RegisterScalarStringAscii(FunctionRegistry* registry) {
#ifdef ARROW_WITH_RE2
AddAsciiStringSplitRegex(registry);
#endif
AddAsciiStringStrptime(registry);
AddAsciiStringJoin(registry);
AddAsciiStringRepeat(registry);
}
Expand Down
34 changes: 27 additions & 7 deletions cpp/src/arrow/compute/kernels/scalar_string_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -1840,14 +1840,34 @@ TYPED_TEST(TestBaseBinaryKernels, ExtractRegexInvalid) {
#endif

// Exercises the "strptime" kernel through CheckUnary with error_is_null enabled
// (null inputs and unparseable strings are expected to come back as null), then
// checks that a parse failure raises Invalid once error_is_null is disabled.
TYPED_TEST(TestStringKernels, Strptime) {
std::string input1 = R"(["5/1/2020", null, "12/11/1900"])";
std::string output1 = R"(["2020-05-01", null, "1900-12-11"])";
StrptimeOptions options("%m/%d/%Y", TimeUnit::MICRO);
this->CheckUnary("strptime", input1, timestamp(TimeUnit::MICRO), output1, &options);
std::string input1 = R"(["5/1/2020", null, null, "12/13/1900", null])";
std::string input2 = R"(["5/1/2020", "12/13/1900"])";
std::string input3 = R"(["5/1/2020", "AA/BB/CCCC"])";
std::string input4 = R"(["5/1/2020", "AA/BB/CCCC", "AA/BB/CCCC", "AA/BB/CCCC", null])";
std::string input5 = R"(["5/1/2020 %z", null, null, "12/13/1900 %z", null])";
std::string output1 = R"(["2020-05-01", null, null, "1900-12-13", null])";
std::string output4 = R"(["2020-01-05", null, null, null, null])";
std::string output2 = R"(["2020-05-01", "1900-12-13"])";
std::string output3 = R"(["2020-05-01", null])";

// With error_is_null set, strings that do not match the format yield null.
StrptimeOptions options("%m/%d/%Y", TimeUnit::MICRO, /*error_is_null=*/true);
auto unit = timestamp(TimeUnit::MICRO);
this->CheckUnary("strptime", input1, unit, output1, &options);
this->CheckUnary("strptime", input2, unit, output2, &options);
this->CheckUnary("strptime", input3, unit, output3, &options);

// Day-first format: "5/1/2020" is now read as 5 January 2020.
options.format = "%d/%m/%Y";
this->CheckUnary("strptime", input4, unit, output4, &options);

input1 = R"(["5/1/2020 %z", null, "12/11/1900 %z"])";
// "%%z" in the format is expected to match the literal text "%z" in the input.
options.format = "%m/%d/%Y %%z";
this->CheckUnary("strptime", input1, timestamp(TimeUnit::MICRO), output1, &options);
this->CheckUnary("strptime", input5, unit, output1, &options);

// Inputs that all parse give the same result when errors are set to raise.
options.error_is_null = false;
this->CheckUnary("strptime", input5, unit, output1, &options);

// input1 lacks the literal "%z" suffix, so parsing fails and must raise.
EXPECT_RAISES_WITH_MESSAGE_THAT(
Invalid, testing::HasSubstr("Invalid: Failed to parse string: '5/1/2020'"),
Strptime(ArrayFromJSON(this->type(), input1), options));
}

TYPED_TEST(TestStringKernels, StrptimeZoneOffset) {
Expand All @@ -1859,7 +1879,7 @@ TYPED_TEST(TestStringKernels, StrptimeZoneOffset) {
std::string input1 = R"(["5/1/2020 +0100", null, "12/11/1900 -0130"])";
std::string output1 =
R"(["2020-04-30T23:00:00.000000", null, "1900-12-11T01:30:00.000000"])";
StrptimeOptions options("%m/%d/%Y %z", TimeUnit::MICRO);
StrptimeOptions options("%m/%d/%Y %z", TimeUnit::MICRO, /*error_is_null=*/true);
this->CheckUnary("strptime", input1, timestamp(TimeUnit::MICRO, "UTC"), output1,
&options);
}
Expand Down
Loading