Skip to content

Commit

Permalink
Fix parse date format with multiple consecutive separators (#1292) (#…
Browse files Browse the repository at this point in the history
…1293)

* fix

* use const reference

Co-authored-by: Yu Lei <[email protected]>
  • Loading branch information
ti-srebot and leiysky authored Dec 17, 2020
1 parent 5cce02e commit a5e2d63
Show file tree
Hide file tree
Showing 3 changed files with 41 additions and 11 deletions.
47 changes: 36 additions & 11 deletions dbms/src/Common/MyTime.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -48,27 +48,52 @@ int getFracIndex(const String & format)
return idx;
}

// Helper used while splitting a date/time literal into its parts: reports whether `c`
// may act as a separator at the current position.
//   - Any punctuation character (as decided by isPunctuation) is accepted at every position.
//   - A space or 'T' is accepted only when exactly two parts have been split off so far
//     (previous_parts == 2), i.e. at the boundary between the date and the time.
bool isValidSeperator(char c, int previous_parts)
{
    const bool at_date_time_boundary = previous_parts == 2;
    return isPunctuation(c) || (at_date_time_boundary && (c == ' ' || c == 'T'));
}

std::vector<String> parseDateFormat(String format)
{
format = Poco::trimInPlace(format);

if (format.size() == 0)
return {};

if (!std::isdigit(format[0]) || !std::isdigit(format[format.size() - 1]))
{
return {};
}

std::vector<String> seps;
seps.reserve(6);
size_t start = 0;
for (size_t i = 0; i < format.size(); i++)
{
if (i == 0 || i + 1 == format.size())
if (isValidSeperator(format[i], seps.size()))
{
if (!std::isdigit(format[i]))
return {};
int previous_parts = seps.size();
seps.push_back(format.substr(start, i - start));
start = i + 1;

for (size_t j = i + 1; j < format.size(); j++)
{
if (!isValidSeperator(format[j], previous_parts))
break;
start++;
i++;
}
continue;
}

if (!std::isdigit(format[i]))
{
if (!std::isdigit(format[i - 1]))
return {};
seps.push_back(format.substr(start, i - start));
start = i + 1;
return {};
}
}
seps.push_back(format.substr(start));
Expand All @@ -87,7 +112,7 @@ std::vector<String> parseDateFormat(String format)
// second link specified that for string literal, "hour values less than 10, a leading zero is required.".
// ISO-8601: Z|((((?P<tz_sign>[-+])(?P<tz_hour>[0-9]{2})(:(?P<tz_minute>[0-9]{2}){0,1}){0,1})|((?P<tz_minute>[0-9]{2}){0,1}){0,1}))$
// see https://www.cl.cam.ac.uk/~mgk25/iso-time.html
std::tuple<int, String, String, String, String> getTimeZone(String literal)
std::tuple<int, String, String, String, String> getTimeZone(const String & literal)
{
static const std::map<int, std::tuple<int, int>> valid_idx_combinations{
{100, {0, 0}}, // 23:59:59Z
Expand Down Expand Up @@ -209,9 +234,9 @@ std::tuple<std::vector<String>, String, bool, String, String, String, String> sp
if (frac_idx > 0)
{
frac = format.substr(frac_idx + 1);
while (frac_idx > 0 && isPunctuation(format[tz_idx - 1]))
while (frac_idx > 0 && isPunctuation(format[frac_idx - 1]))
{
// in case of multiple separators, e.g. 2020-10--10
// in case of multiple separators, e.g. 2020-10-10 11:00:00..123456
frac_idx--;
}
format = format.substr(0, frac_idx);
Expand Down Expand Up @@ -728,7 +753,7 @@ Field parseMyDateTime(const String & str, int8_t fsp)
bool truncated_or_incorrect = false;

// noAbsorb tests if can absorb FSP or TZ
auto noAbsorb = [](std::vector<String> seps) {
auto noAbsorb = [](const std::vector<String> & seps) {
// if we have more than 5 parts (i.e. 6), the trailing part can't be absorbed
// or if we only have 1 part, but its length is longer than 4, then it is at least YYMMD, in this case, FSP can
// not be absorbed, and it will be handled later, and the leading sign prevents TZ from being absorbed, because
Expand Down
4 changes: 4 additions & 0 deletions dbms/src/Common/MyTime.h
Original file line number Diff line number Diff line change
Expand Up @@ -126,5 +126,9 @@ size_t maxFormattedDateTimeStringLength(const String & format);

MyDateTime numberToDateTime(Int64 number);

bool isPunctuation(char c);

bool isValidSeperator(char c, int previous_parts);


} // namespace DB
1 change: 1 addition & 0 deletions dbms/src/Common/tests/gtest_mytime.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,7 @@ try
{"2020-10-10 10-10.10", "2020-10-10 10:10:10.000000"},
{"2020-10-10 10.10", "2020-10-10 10:10:00.000000"},
{"2018.01.01", "2018-01-01 00:00:00.000000"},
{"2020--12-10 11:11:11..123456", "2020-12-10 11:11:11.123456"},
};
DataTypeMyDateTime type_with_fraction(6);
for (auto & [str, expected] : cases_with_fsp)
Expand Down

0 comments on commit a5e2d63

Please sign in to comment.