Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

str_to_date function push down (for poc) #1960

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
726 changes: 725 additions & 1 deletion dbms/src/Common/MyTime.cpp

Large diffs are not rendered by default.

19 changes: 19 additions & 0 deletions dbms/src/Common/MyTime.h
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
#include <Core/Field.h>
#include <common/DateLUTImpl.h>

struct StringRef;
namespace DB
{

Expand Down Expand Up @@ -129,6 +130,24 @@ struct MyDateTimeFormatter
}
};

// Parses date/time strings according to a format string supplied at construction.
// The unit test (gtest_mytime.cpp) drives it with MySQL-style specifiers such as
// "%d/%b/%Y %H:%i:%S.%f".
struct MyDateTimeParser
{
// Builds a parser for `format_`.
// NOTE(review): presumably `parsers` is populated from `format_` here; the definition
// lives in MyTime.cpp -- confirm there.
explicit MyDateTimeParser(const String & format_);

// Parses `str_view` and returns the date/time in packed UInt64 form.
// The unit test expects std::nullopt when the input does not match the format.
std::optional<UInt64> parseAsPackedUInt(const StringRef & str_view) const;

// Parsing state passed to every callback; only forward-declared in this header.
struct Context;

private:
// The format string this parser was constructed with.
const String format;

// Parsing method. Parses from ctx.view[ctx.pos].
// On success, updates `datetime` and `ctx` and returns true.
// On failure, returns false.
using ParserCallback = std::function<bool(MyDateTimeParser::Context & ctx, MyTimeBase & datetime)>;
// NOTE(review): presumably one callback per element of `format`, applied in order -- confirm in MyTime.cpp.
std::vector<ParserCallback> parsers;
};

Field parseMyDateTime(const String & str, int8_t fsp = 6);

void convertTimeZone(UInt64 from_time, UInt64 & to_time, const DateLUTImpl & time_zone_from, const DateLUTImpl & time_zone_to);
Expand Down
30 changes: 30 additions & 0 deletions dbms/src/Common/StringUtils/StringRefUtils.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
#pragma once

#include <Common/StringUtils/StringUtils.h>
#include <common/StringRef.h>

// Returns true when the bytes of `view` begin with the bytes of `prefix` (case-sensitive).
inline bool startsWith(const StringRef & view, const StringRef & prefix)
{
    const bool has_prefix = detail::startsWith(view.data, view.size, prefix.data, prefix.size);
    return has_prefix;
}

// Same check as startsWith, except that letter case is ignored during the comparison.
inline bool startsWithCI(const StringRef & view, const StringRef & prefix)
{
    const bool has_prefix = detail::startsWithCI(view.data, view.size, prefix.data, prefix.size);
    return has_prefix;
}

// Returns true when the bytes of `view` end with the NUL-terminated string `suffix` (case-sensitive).
// `suffix` must not be null: its length is obtained with strlen.
inline bool endsWith(const StringRef & view, const char * suffix)
{
    return detail::endsWith(view.data, view.size, suffix, strlen(suffix));
}

// Case-insensitive version of endsWith.
// `suffix` is a NUL-terminated string and must not be null: its length is obtained with strlen.
inline bool endsWithCI(const StringRef & view, const char * suffix)
{
    return detail::endsWithCI(view.data, view.size, suffix, strlen(suffix));
}

// Returns a view over `view` with its first `n` characters dropped.
// The behavior is undefined if `n > view.size`.
inline StringRef removePrefix(const StringRef & view, size_t n)
{
    StringRef rest{view.data + n, view.size - n};
    return rest;
}
36 changes: 32 additions & 4 deletions dbms/src/Common/StringUtils/StringUtils.cpp
Original file line number Diff line number Diff line change
@@ -1,16 +1,44 @@
#include "StringUtils.h"

#include <cctype>

namespace detail
{

bool startsWith(const std::string & s, const char * prefix, size_t prefix_size)
/// Checks whether the buffer `s` of `size` bytes begins with the `prefix_size` bytes at `prefix`.
/// The comparison is byte-wise and case-sensitive.
/// An empty prefix always matches, even when the pointers are null (e.g. an empty view).
bool startsWith(const char * s, size_t size, const char * prefix, size_t prefix_size)
{
    if (size < prefix_size)
        return false;
    // memcmp requires valid pointers even for a zero length, so do not call it for an empty prefix.
    if (prefix_size == 0)
        return true;
    return 0 == memcmp(s, prefix, prefix_size);
}

/// Checks whether the buffer `s` of `size` bytes ends with the `suffix_size` bytes at `suffix`.
/// The comparison is byte-wise and case-sensitive.
/// An empty suffix always matches, even when the pointers are null (e.g. an empty view).
bool endsWith(const char * s, size_t size, const char * suffix, size_t suffix_size)
{
    if (size < suffix_size)
        return false;
    // memcmp requires valid pointers even for a zero length, so do not call it for an empty suffix.
    if (suffix_size == 0)
        return true;
    // Compare against the last `suffix_size` bytes of `s`.
    return 0 == memcmp(s + (size - suffix_size), suffix, suffix_size);
}

bool endsWith(const std::string & s, const char * suffix, size_t suffix_size)
/// Case-insensitive version of startsWith for raw buffers.
/// Returns true when the first `prefix_size` bytes of `s` equal `prefix` after both sides are
/// lower-cased with std::tolower; returns false when `s` is shorter than `prefix`.
bool startsWithCI(const char * s, size_t size, const char * prefix, size_t prefix_size)
{
    if (size < prefix_size)
        return false;
    for (size_t i = 0; i < prefix_size; ++i)
    {
        // std::tolower is only defined for values representable as unsigned char (or EOF),
        // so convert first to keep bytes >= 0x80 from being passed as negative ints.
        const int lhs = std::tolower(static_cast<unsigned char>(s[i]));
        const int rhs = std::tolower(static_cast<unsigned char>(prefix[i]));
        if (lhs != rhs)
            return false;
    }
    return true;
}

/// Case-insensitive version of endsWith for raw buffers.
/// Returns true when the last `suffix_size` bytes of `s` equal `suffix` after both sides are
/// lower-cased with std::tolower; returns false when `s` is shorter than `suffix`.
bool endsWithCI(const char * s, size_t size, const char * suffix, size_t suffix_size)
{
    if (size < suffix_size)
        return false;
    // The comparison must start at the tail of `s`; indexing from s[0] would test the prefix instead.
    const char * tail = s + (size - suffix_size);
    for (size_t i = 0; i < suffix_size; ++i)
    {
        // std::tolower is only defined for values representable as unsigned char (or EOF),
        // so convert first to keep bytes >= 0x80 from being passed as negative ints.
        const int lhs = std::tolower(static_cast<unsigned char>(tail[i]));
        const int rhs = std::tolower(static_cast<unsigned char>(suffix[i]));
        if (lhs != rhs)
            return false;
    }
    return true;
}

} // namespace detail
19 changes: 12 additions & 7 deletions dbms/src/Common/StringUtils/StringUtils.h
Original file line number Diff line number Diff line change
@@ -1,38 +1,43 @@
#pragma once

#include <string>
#include <string_view>
#include <cstring>
#include <cstddef>


namespace detail
{
/// Helpers operating on raw (pointer, length) buffers; the std::string and StringRef
/// front ends forward to these.

/// True when the first `prefix_size` bytes of `s` equal `prefix` (false if `size < prefix_size`).
bool startsWith(const char * s, size_t size, const char * prefix, size_t prefix_size);
/// True when the last `suffix_size` bytes of `s` equal `suffix` (false if `size < suffix_size`).
bool endsWith(const char * s, size_t size, const char * suffix, size_t suffix_size);

// case insensitive version
bool startsWithCI(const char * s, size_t size, const char * prefix, size_t prefix_size);
bool endsWithCI(const char * s, size_t size, const char * suffix, size_t suffix_size);
} // namespace detail


/// Returns true when `s` begins with `prefix`.
inline bool startsWith(const std::string & s, const std::string & prefix)
{
    // Forward to the raw-buffer helper using its (pointer, length) signature.
    return detail::startsWith(s.data(), s.size(), prefix.data(), prefix.size());
}

/// Returns true when `s` ends with `suffix`.
inline bool endsWith(const std::string & s, const std::string & suffix)
{
    // Forward to the raw-buffer helper using its (pointer, length) signature.
    return detail::endsWith(s.data(), s.size(), suffix.data(), suffix.size());
}


/// With GCC, strlen is evaluated at compile time if we pass it a constant
/// string that is known at compile time.
/// `prefix` is a NUL-terminated string and must not be null.
inline bool startsWith(const std::string & s, const char * prefix)
{
    // Forward to the raw-buffer helper using its (pointer, length) signature.
    return detail::startsWith(s.data(), s.size(), prefix, strlen(prefix));
}

/// Returns true when `s` ends with the NUL-terminated string `suffix` (which must not be null).
inline bool endsWith(const std::string & s, const char * suffix)
{
    // Forward to the raw-buffer helper using its (pointer, length) signature.
    return detail::endsWith(s.data(), s.size(), suffix, strlen(suffix));
}

/// Given an integer, return the adequate suffix for
Expand Down
161 changes: 160 additions & 1 deletion dbms/src/Common/tests/gtest_mytime.cpp
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
#include <Common/Exception.h>
#include <Common/MyTime.h>
#include <DataTypes/DataTypeMyDateTime.h>
#include <gtest/gtest.h>
#include <TestUtils/TiFlashTestBasic.h>

#include <iostream>
#include <string>
Expand Down Expand Up @@ -153,6 +153,165 @@ catch (Exception & e)
GTEST_FAIL();
}

// Table-driven test for MyDateTimeParser. Each case is {input string, format string, expected result}.
// An expected value of std::nullopt means parsing must fail; otherwise the packed result must equal
// expected->toPackedUInt().
TEST_F(TestMyTime, Parser)
try
{
std::vector<std::tuple<String, String, std::optional<MyDateTime>>> cases{
{" 2/Jun", "%d/%b/%Y", MyDateTime{0, 6, 2, 0, 0, 0, 0}}, // More patterns than input string
{" liter", "lit era l", MyDateTime{0, 0, 0, 0, 0, 0, 0}}, // More patterns than input string
// Test case for empty input
{" ", " ", MyDateTime{0, 0, 0, 0, 0, 0, 0}},
{" ", "%d/%b/%Y", MyDateTime{0, 0, 0, 0, 0, 0, 0}},
// Prefix white spaces should be ignored
{" 2/Jun/2019 ", "%d/%b/%Y", MyDateTime{2019, 6, 2, 0, 0, 0, 0}},
{" 2/Jun/2019 ", " %d/%b/%Y", MyDateTime{2019, 6, 2, 0, 0, 0, 0}},
//
{"31/May/2016 12:34:56.1234", "%d/%b/%Y %H:%i:%S.%f", MyDateTime{2016, 5, 31, 12, 34, 56, 123400}},
{"31/may/2016 12:34:56.1234", "%d/%b/%Y %H:%i:%S.%f", MyDateTime{2016, 5, 31, 12, 34, 56, 123400}}, // case insensitive
{"31/mayy/2016 12:34:56.1234", "%d/%b/%Y %H:%i:%S.%f", std::nullopt}, // invalid %b
{"31/mey/2016 12:34:56.1234", "%d/%b/%Y %H:%i:%S.%f", std::nullopt}, // invalid %b
{"30/April/2016 12:34:56.", "%d/%M/%Y %H:%i:%s.%f", MyDateTime{2016, 4, 30, 12, 34, 56, 0}}, // empty %f is valid
{"30/april/2016 12:34:56.", "%d/%M/%Y %H:%i:%s.%f", MyDateTime{2016, 4, 30, 12, 34, 56, 0}}, // case insensitive
{"30/Apri/2016 12:34:56.", "%d/%M/%Y %H:%i:%s.%f", std::nullopt}, // invalid %M
{"30/Aprill/2016 12:34:56.", "%d/%M/%Y %H:%i:%s.%f", std::nullopt}, // invalid %M
{"30/Feb/2016 12:34:56.1234", "%d/%b/%Y %H:%i:%S.%f",
MyDateTime{2016, 2, 30, 12, 34, 56, 123400}}, // Feb 30th (does not exist in reality) is valid for parsing (in mariadb)
{"31/April/2016 12:34:56.", "%d/%M/%Y %H:%i:%s.%f", MyDateTime{2016, 4, 31, 12, 34, 56, 0}}, // April 31st (does not exist in reality)
{"01,5,2013 9", "%d,%c,%Y %f", MyDateTime{2013, 5, 1, 0, 0, 0, 900000}},
{"01,52013", "%d,%c%Y", std::nullopt}, // %c will try to parse '52' as month and fail
{"01,5,2013", "%d,%c,%Y", MyDateTime{2013, 5, 1, 0, 0, 0, 0}},
{"01,5,2013 ", "%d,%c,%Y %f", MyDateTime{2013, 5, 1, 0, 0, 0, 0}},

/// Test cases for AM/PM set
{"10:11:12 AM", "%H:%i:%S %p", std::nullopt}, // should not set %H %p at the same time
{"10:11:12 Am", "%h:%i:%S %p", MyDateTime(0, 0, 0, 10, 11, 12, 0)},
{"10:11:12 A", "%h:%i:%S %p", std::nullopt}, // EOF while parsing "AM"/"PM"
{"00:11:12 AM", "%h:%i:%S %p", std::nullopt}, // should not happen: %p set, %h not set
{"11:12 AM", "%i:%S %p", std::nullopt}, // should not happen: %p set, %h not set
{"11:12 abcd", "%i:%S ", MyDateTime{0, 0, 0, 0, 11, 12, 0}}, // without %p, %h not set is ok
{"00:11:12 ", "%h:%i:%S ", std::nullopt}, // 0 is not a valid number of %h
{"12:11:12 AP", "%h:%i:%S %p", std::nullopt}, // only AM/PM is valid
{"12:11:12 AM", "%h:%i:%S %p", MyDateTime(0, 0, 0, 0, 11, 12, 0)},
{"12:11:12 PM", "%h:%i:%S %p", MyDateTime(0, 0, 0, 12, 11, 12, 0)},
{"11:11:12 pM", "%h:%i:%S %p", MyDateTime(0, 0, 0, 23, 11, 12, 0)},
/// Special case for %h with 12
{"12:11:23 ", "%h:%i:%S ", MyDateTime(0, 0, 0, 0, 11, 23, 0)},
// For %% -- FIXME: Ignored for now, both tidb and mariadb 10.3.14 can not handle it
// {"01/Feb/2016 % 23:45:54", "%d/%b/%Y %% %H:%i:%S", MyDateTime(2016, 2, 1, 23, 45, 54, 0)},
// {"01/Feb/2016 %% 23:45:54", "%d/%b/%Y %%%% %H:%i:%S", MyDateTime(2016, 2, 1, 23, 45, 54, 0)},
{"01/Feb/2016 % 23:45:54", "%d/%b/%Y %% %H:%i:%S", std::nullopt},
{"01/Feb/2016 %% 23:45:54", "%d/%b/%Y %%%% %H:%i:%S", std::nullopt},

/// Test cases for %r
{" 04 :13:56 AM13/05/2019", "%r %d/%c/%Y", MyDateTime{2019, 5, 13, 4, 13, 56, 0}},
{"13:13:56 AM13/5/2019", "%r", std::nullopt}, // hh = 13 with am is invalid
{"00:13:56 AM13/05/2019", "%r", std::nullopt}, // hh = 0 with am is invalid
{"00:13:56 pM13/05/2019", "%r", std::nullopt}, // hh = 0 with pm is invalid
{"12: 13:56 AM 13/05/2019", "%r%d/%c/%Y", MyDateTime{2019, 5, 13, 0, 13, 56, 0}},
{"12:13 :56 pm 13/05/2019", "%r %d/%c/%Y", MyDateTime{2019, 5, 13, 12, 13, 56, 0}},
{"11:13: 56pm 13/05/2019", "%r %d/%c/%Y", MyDateTime{2019, 5, 13, 23, 13, 56, 0}},
{"11:13:56a", "%r", std::nullopt}, // EOF while parsing "AM"/"PM"
{"11:13", "%r", MyDateTime{0, 0, 0, 11, 13, 0, 0}},
{"11:", "%r", MyDateTime{0, 0, 0, 11, 0, 0, 0}},
{"12", "%r", MyDateTime{0, 0, 0, 0, 0, 0, 0}},

/// Test cases for %T
{" 4 :13:56 13/05/2019", "%T %d/%c/%Y", MyDateTime{2019, 5, 13, 4, 13, 56, 0}},
{"23: 13:56 13/05/2019", "%T%d/%c/%Y", MyDateTime{2019, 5, 13, 23, 13, 56, 0}},
{"12:13 :56 13/05/2019", "%T %d/%c/%Y", MyDateTime{2019, 5, 13, 12, 13, 56, 0}},
{"19:13: 56 13/05/2019", "%T %d/%c/%Y", MyDateTime{2019, 5, 13, 19, 13, 56, 0}},
{"21:13", "%T", MyDateTime{0, 0, 0, 21, 13, 0, 0}},
{"21:", "%T", MyDateTime{0, 0, 0, 21, 0, 0, 0}},

// multiple chars between patterns
{"01/Feb/2016 abcdefg 23:45:54", "%d/%b/%Y abcdefg %H:%i:%S", MyDateTime(2016, 2, 1, 23, 45, 54, 0)},
// the number of whitespace between pattern and input doesn't matter
{"01/Feb/2016 abcdefg 23:45: 54", "%d/%b/%Y abcdefg %H :%i:%S", MyDateTime(2016, 2, 1, 23, 45, 54, 0)},
{"01/Feb/ 2016 abc defg 23:45:54", "%d/ %b/%Y abcdefg %H: %i:%S", MyDateTime(2016, 2, 1, 23, 45, 54, 0)},
{"01/Feb /2016 ab cdefg 23: 45:54", "%d /%b/%Y abc defg %H:%i :%S", MyDateTime{2016, 2, 1, 23, 45, 54, 0}},

/// Cases collected from the MySQL 8.0 documentation
{"01,5,2013", "%d,%m,%Y", MyDateTime{2013, 5, 1, 0, 0, 0, 0}},
{"May 1, 2013", "%M %d,%Y", MyDateTime{2013, 5, 1, 0, 0, 0, 0}},
{"a09:30:17", "a%h:%i:%s", MyDateTime{0, 0, 0, 9, 30, 17, 0}},
{"a09:30:17", "%h:%i:%s", std::nullopt},
{"09:30:17a", "%h:%i:%s", MyDateTime{0, 0, 0, 9, 30, 17, 0}},
{"abc", "abc", MyDateTime{0, 0, 0, 0, 0, 0, 0}},
{"9", "%m", MyDateTime{0, 9, 0, 0, 0, 0, 0}},
{"9", "%s", MyDateTime{0, 0, 0, 0, 0, 9, 0}},
// Range checking on the parts of date values is as described in Section 11.2.2, “The DATE, DATETIME, and TIMESTAMP Types”. This means, for example, that “zero” dates or dates with part values of 0 are permitted unless the SQL mode is set to disallow such values.
{"00/00/0000", "%m/%d/%Y", MyDateTime{0, 0, 0, 0, 0, 0, 0}},
{"04/31/2004", "%m/%d/%Y", MyDateTime{2004, 4, 31, 0, 0, 0, 0}},

/// Below cases are ported from TiDB
{"10/28/2011 9:46:29 pm", "%m/%d/%Y %l:%i:%s %p", MyDateTime(2011, 10, 28, 21, 46, 29, 0)},
{"10/28/2011 9:46:29 Pm", "%m/%d/%Y %l:%i:%s %p", MyDateTime(2011, 10, 28, 21, 46, 29, 0)},
{"2011/10/28 9:46:29 am", "%Y/%m/%d %l:%i:%s %p", MyDateTime(2011, 10, 28, 9, 46, 29, 0)},
{"20161122165022", "%Y%m%d%H%i%s", MyDateTime(2016, 11, 22, 16, 50, 22, 0)},
{"2016 11 22 16 50 22", "%Y%m%d%H%i%s", MyDateTime(2016, 11, 22, 16, 50, 22, 0)}, // fail, should ignore sep
{"16-50-22 2016 11 22", "%H-%i-%s%Y%m%d", MyDateTime(2016, 11, 22, 16, 50, 22, 0)}, // fail, should ignore sep
{"16-50 2016 11 22", "%H-%i-%s%Y%m%d", std::nullopt},
{"15-01-2001 1:59:58.999", "%d-%m-%Y %I:%i:%s.%f", MyDateTime(2001, 1, 15, 1, 59, 58, 999000)},
{"15-01-2001 1:59:58.1", "%d-%m-%Y %H:%i:%s.%f", MyDateTime(2001, 1, 15, 1, 59, 58, 100000)},
{"15-01-2001 1:59:58.", "%d-%m-%Y %H:%i:%s.%f", MyDateTime(2001, 1, 15, 1, 59, 58, 0)},
{"15-01-2001 1:9:8.999", "%d-%m-%Y %H:%i:%s.%f", MyDateTime(2001, 1, 15, 1, 9, 8, 999000)},
{"15-01-2001 1:9:8.999", "%d-%m-%Y %H:%i:%S.%f", MyDateTime(2001, 1, 15, 1, 9, 8, 999000)},
{"2003-01-02 10:11:12 PM", "%Y-%m-%d %H:%i:%S %p", std::nullopt}, // should not set %H %p at the same time
{"10:20:10AM", "%H:%i:%S%p", std::nullopt}, // should not set %H %p at the same time
// test %@(skip alpha), %#(skip number), %.(skip punct)
{"2020-10-10ABCD", "%Y-%m-%d%@", MyDateTime(2020, 10, 10, 0, 0, 0, 0)},
{"2020-10-101234", "%Y-%m-%d%#", MyDateTime(2020, 10, 10, 0, 0, 0, 0)},
{"2020-10-10....", "%Y-%m-%d%.", MyDateTime(2020, 10, 10, 0, 0, 0, 0)},
{"2020-10-10.1", "%Y-%m-%d%.%#%@", MyDateTime(2020, 10, 10, 0, 0, 0, 0)},
{"abcd2020-10-10.1", "%@%Y-%m-%d%.%#%@", MyDateTime(2020, 10, 10, 0, 0, 0, 0)},
{"abcd-2020-10-10.1", "%@-%Y-%m-%d%.%#%@", MyDateTime(2020, 10, 10, 0, 0, 0, 0)},
{"2020-10-10", "%Y-%m-%d%@", MyDateTime(2020, 10, 10, 0, 0, 0, 0)},
{"2020-10-10abcde123abcdef", "%Y-%m-%d%@%#", MyDateTime(2020, 10, 10, 0, 0, 0, 0)},
};
// Used only to render expected/actual values as text for the failure messages below.
auto result_formatter = MyDateTimeFormatter("%Y/%m/%d %T.%f");
// Index of the current case, reported in failure messages.
size_t idx = 0;
for (const auto & [input, fmt, expected] : cases)
{
MyDateTimeParser parser(fmt);
auto packed = parser.parseAsPackedUInt(input);
if (expected == std::nullopt)
{
// Parsing is expected to fail; if it unexpectedly succeeded, format the value for the message.
MyTimeBase actual_time;
String actual_str;
if (packed)
{
actual_time = MyTimeBase(*packed);
result_formatter.format(actual_time, actual_str);
}
EXPECT_FALSE((bool)packed) //
<< "[case=" << idx << "] "
<< "[fmt=" << fmt << "] [input=" << input << "] [actual=" << actual_str << "]";
}
else
{
// Parsing is expected to succeed and yield exactly the expected packed value.
MyTimeBase actual_time;
String actual_str, expect_str;
result_formatter.format(*expected, expect_str);
if (packed)
{
actual_time = MyTimeBase(*packed);
result_formatter.format(actual_time, actual_str);
EXPECT_EQ(*packed, expected->toPackedUInt())
<< "[case=" << idx << "] "
<< "[fmt=" << fmt << "] [input=" << input << "] [expect=" << expect_str << "] [actual=" << actual_str << "]";
}
else
{
// `packed` is empty here, so this expectation always fails and reports the parse failure.
EXPECT_TRUE((bool)packed) //
<< "[case=" << idx << "] "
<< "[fmt=" << fmt << "] [input=" << input << "] [expect=" << expect_str << "] [actual=<parse fail>]";
}
}
idx++;
}
}
CATCH

} // namespace tests

} // namespace DB
6 changes: 3 additions & 3 deletions dbms/src/Flash/Coprocessor/DAGUtils.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -912,9 +912,9 @@ std::unordered_map<tipb::ScalarFuncSig, String> scalar_func_map({
//{tipb::ScalarFuncSig::TimestampLiteral, "cast"},

//{tipb::ScalarFuncSig::LastDay, "cast"},
//{tipb::ScalarFuncSig::StrToDateDate, "cast"},
//{tipb::ScalarFuncSig::StrToDateDatetime, "cast"},
//{tipb::ScalarFuncSig::StrToDateDuration, "cast"},
{tipb::ScalarFuncSig::StrToDateDate, "strToDateDate"},
{tipb::ScalarFuncSig::StrToDateDatetime, "strToDateDatetime"},
// {tipb::ScalarFuncSig::StrToDateDuration, "cast"},
{tipb::ScalarFuncSig::FromUnixTime1Arg, "fromUnixTime"}, {tipb::ScalarFuncSig::FromUnixTime2Arg, "fromUnixTime"},
{tipb::ScalarFuncSig::ExtractDatetime, "extractMyDateTime"},
//{tipb::ScalarFuncSig::ExtractDuration, "cast"},
Expand Down
Loading