From f08d34f3c613b213f97d0953c565cff67ba69623 Mon Sep 17 00:00:00 2001 From: Denis Glazachev Date: Fri, 10 Apr 2020 19:37:42 +0400 Subject: [PATCH] Review IDENT token parsing, to allow any characters in backtick-wrapped identifiers, and allow mixing backtick-wrapped identifiers and simple ones when used with a connecting dot. Handle dot in a generic way when parsing IDENT tokens. Add positive and negative tests for parsing changes. Add a test for FLOOR translation. --- driver/escaping/lexer.cpp | 60 ++++++++++++++++--------- driver/test/escape_sequences_ut.cpp | 69 +++++++++++++++++++++++++++++ 2 files changed, 108 insertions(+), 21 deletions(-) diff --git a/driver/escaping/lexer.cpp b/driver/escaping/lexer.cpp index b4bd914aa..eca430025 100644 --- a/driver/escaping/lexer.cpp +++ b/driver/escaping/lexer.cpp @@ -164,41 +164,59 @@ Token Lexer::NextToken() { has_slash = false; } - return Token {Token::INVALID, StringView(st, cur_ - st)}; + return Token {Token::INVALID, StringView(st, cur_)}; } default: { const char * st = cur_; - if (*cur_ == '`') { - bool inside_quotes = true; - for (++cur_; cur_ < end_; ++cur_) { + if (isalpha(*cur_) || *cur_ == '_' || *cur_ == '`') { + bool has_dot = false; + bool has_backtick = false; + + while (cur_ < end_) { if (*cur_ == '`') { - inside_quotes = !inside_quotes; - if (cur_ < end_ && *(cur_ + 1) == '.') { - ++cur_; - continue; - } else if (!inside_quotes) - return Token {Token::IDENT, StringView(st, ++cur_)}; - if (cur_ < end_) - ++cur_; + has_backtick = true; + bool found_closing_backtick = false; + + for (++cur_; cur_ < end_; ++cur_) { + if (*cur_ == '`') { + found_closing_backtick = true; + ++cur_; + break; + } + } + + if (!found_closing_backtick) { + return Token {Token::INVALID, StringView(st, cur_)}; + } + } + else if (isalpha(*cur_) || *cur_ == '_') { + for (++cur_; cur_ < end_; ++cur_) { + if (!isalpha(*cur_) && !isdigit(*cur_) && *cur_ != '_') { + break; + } + } } - if (!isalpha(*cur_) && !isdigit(*cur_) && *cur_ != '_' &&
*cur_ != '.') { + else { return Token {Token::INVALID, StringView(st, cur_)}; } - } - - break; - } - if (isalpha(*cur_) || *cur_ == '_') { - for (++cur_; cur_ < end_; ++cur_) { - if (!isalpha(*cur_) && !isdigit(*cur_) && *cur_ != '_' && *cur_ != '.') { + if (cur_ < end_ && *cur_ == '.') { + has_dot = true; + ++cur_; + } + else { break; } } - return Token {LookupIdent(to_upper(StringView(st, cur_))), StringView(st, cur_)}; + if (has_dot || has_backtick) { + return Token {Token::IDENT, StringView(st, cur_)}; + } + else { + return Token {LookupIdent(to_upper(StringView(st, cur_))), StringView(st, cur_)}; + } } if (isdigit(*cur_) || *cur_ == '.' || *cur_ == '-') { diff --git a/driver/test/escape_sequences_ut.cpp b/driver/test/escape_sequences_ut.cpp index 09d9137a9..59616f7a4 100644 --- a/driver/test/escape_sequences_ut.cpp +++ b/driver/test/escape_sequences_ut.cpp @@ -2,6 +2,71 @@ #include +TEST(EscapeSequencesCase, ParseIdent1) { + ASSERT_EQ(replaceEscapeSequences("SELECT SUM({fn CONVERT(abc, SQL_BIGINT)})"), + "SELECT SUM(toInt64(abc))"); +} + +TEST(EscapeSequencesCase, ParseIdent2) { + ASSERT_EQ(replaceEscapeSequences("SELECT SUM({fn CONVERT(`abc`, SQL_BIGINT)})"), + "SELECT SUM(toInt64(`abc`))"); +} + +TEST(EscapeSequencesCase, ParseIdent3) { + ASSERT_EQ(replaceEscapeSequences("SELECT SUM({fn CONVERT(`0 a b $ c`, SQL_BIGINT)})"), + "SELECT SUM(toInt64(`0 a b $ c`))"); +} + +TEST(EscapeSequencesCase, ParseIdent4) { + ASSERT_EQ(replaceEscapeSequences("SELECT SUM({fn CONVERT(abc.`0 a b $ c`, SQL_BIGINT)})"), + "SELECT SUM(toInt64(abc.`0 a b $ c`))"); +} + +TEST(EscapeSequencesCase, ParseIdent5) { + ASSERT_EQ(replaceEscapeSequences("SELECT SUM({fn CONVERT(`0 a b $ c`.abc, SQL_BIGINT)})"), + "SELECT SUM(toInt64(`0 a b $ c`.abc))"); +} + +TEST(EscapeSequencesCase, ParseIdent6) { + ASSERT_EQ(replaceEscapeSequences("SELECT SUM({fn CONVERT(abc.`0 a b $ c`.abc, SQL_BIGINT)})"), + "SELECT SUM(toInt64(abc.`0 a b $ c`.abc))"); +} + +TEST(EscapeSequencesCase, ParseIdent7) { + 
ASSERT_EQ(replaceEscapeSequences("SELECT SUM({fn CONVERT(`0 a b $ c`.abc.`0 a b $ c`, SQL_BIGINT)})"), + "SELECT SUM(toInt64(`0 a b $ c`.abc.`0 a b $ c`))"); +} + +TEST(EscapeSequencesCase, ParseIdentX1) { + ASSERT_EQ(replaceEscapeSequences("SELECT SUM({fn CONVERT(0 a b $ c, SQL_BIGINT)})"), + "SELECT SUM({fn CONVERT(0 a b $ c, SQL_BIGINT)})"); +} + +TEST(EscapeSequencesCase, ParseIdentX2) { + ASSERT_EQ(replaceEscapeSequences("SELECT SUM({fn CONVERT(.abc, SQL_BIGINT)})"), + "SELECT SUM({fn CONVERT(.abc, SQL_BIGINT)})"); +} + +TEST(EscapeSequencesCase, ParseIdentX3) { + ASSERT_EQ(replaceEscapeSequences("SELECT SUM({fn CONVERT(.`abc`, SQL_BIGINT)})"), + "SELECT SUM({fn CONVERT(.`abc`, SQL_BIGINT)})"); +} + +TEST(EscapeSequencesCase, ParseIdentX4) { + ASSERT_EQ(replaceEscapeSequences("SELECT SUM({fn CONVERT(abc., SQL_BIGINT)})"), + "SELECT SUM({fn CONVERT(abc., SQL_BIGINT)})"); +} + +TEST(EscapeSequencesCase, ParseIdentX5) { + ASSERT_EQ(replaceEscapeSequences("SELECT SUM({fn CONVERT(`abc`., SQL_BIGINT)})"), + "SELECT SUM({fn CONVERT(`abc`., SQL_BIGINT)})"); +} + +TEST(EscapeSequencesCase, ParseIdentX6) { + ASSERT_EQ(replaceEscapeSequences("SELECT SUM({fn CONVERT(abc..abc, SQL_BIGINT)})"), + "SELECT SUM({fn CONVERT(abc..abc, SQL_BIGINT)})"); +} + TEST(EscapeSequencesCase, ParseConvert1) { ASSERT_EQ(replaceEscapeSequences("SELECT {fn CONVERT(1, SQL_BIGINT)}"), "SELECT toInt64(1)"); } @@ -52,6 +117,10 @@ TEST(EscapeSequencesCase, ParseRound) { ASSERT_EQ(replaceEscapeSequences("SELECT {fn ROUND(1.1 + 2.4, 1)}"), "SELECT round(1.1 + 2.4, 1)"); } +TEST(EscapeSequencesCase, ParseFloor) { + ASSERT_EQ(replaceEscapeSequences("SELECT {fn FLOOR(1.1 + 2.4, 1)}"), "SELECT floor(1.1 + 2.4, 1)"); +} + TEST(EscapeSequencesCase, ParsePower) { ASSERT_EQ(replaceEscapeSequences("SELECT {fn POWER(`f_g38d`.`hsf_thkd_wect_fxge`,2)}"), "SELECT pow(`f_g38d`.`hsf_thkd_wect_fxge`,2)"); }