Skip to content

Commit

Permalink
add regular chars
Browse files Browse the repository at this point in the history
  • Loading branch information
rui-mo committed Aug 9, 2023
1 parent d26b6de commit e404d87
Show file tree
Hide file tree
Showing 5 changed files with 55 additions and 16 deletions.
6 changes: 4 additions & 2 deletions velox/type/Subfield.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -18,8 +18,10 @@

namespace facebook::velox::common {

Subfield::Subfield(const std::string& path) {
Tokenizer tokenizer(path);
Subfield::Subfield(
const std::string& path,
const std::optional<std::vector<char>>& separators) {
Tokenizer tokenizer(path, separators);
VELOX_CHECK(tokenizer.hasNext(), "Column name is missing: {}", path);

auto firstElement = tokenizer.next();
Expand Down
5 changes: 4 additions & 1 deletion velox/type/Subfield.h
Original file line number Diff line number Diff line change
Expand Up @@ -191,7 +191,10 @@ class Subfield {
};

public:
explicit Subfield(const std::string& path);
// 'separators': optional custom set of separator characters used to tokenize
// the field name. When std::nullopt, the Tokenizer's default separators apply.
explicit Subfield(
const std::string& path,
const std::optional<std::vector<char>>& separators = std::nullopt);

explicit Subfield(std::vector<std::unique_ptr<PathElement>>&& path);

Expand Down
32 changes: 24 additions & 8 deletions velox/type/Tokenizer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -17,9 +17,15 @@

namespace facebook::velox::common {

Tokenizer::Tokenizer(const std::string& path) : path_(path) {
Tokenizer::Tokenizer(
const std::string& path,
const std::optional<std::vector<char>>& separators)
: path_(path) {
state = State::kNotReady;
index_ = 0;
if (separators.has_value()) {
separators_ = separators.value();
}
}

bool Tokenizer::hasNext() {
Expand Down Expand Up @@ -54,17 +60,17 @@ std::unique_ptr<Subfield::PathElement> Tokenizer::computeNext() {
return nullptr;
}

if (tryMatch(DOT)) {
if (tryMatchSeparator(DOT)) {
std::unique_ptr<Subfield::PathElement> token = matchPathSegment();
firstSegment = false;
return token;
}

if (tryMatch(OPEN_BRACKET)) {
std::unique_ptr<Subfield::PathElement> token = tryMatch(QUOTE)
if (tryMatchSeparator(OPEN_BRACKET)) {
std::unique_ptr<Subfield::PathElement> token = tryMatchSeparator(QUOTE)
? matchQuotedSubscript()
: tryMatch(WILDCARD) ? matchWildcardSubscript()
: matchUnquotedSubscript();
: tryMatchSeparator(WILDCARD) ? matchWildcardSubscript()
: matchUnquotedSubscript();

match(CLOSE_BRACKET);
firstSegment = false;
Expand All @@ -80,6 +86,10 @@ std::unique_ptr<Subfield::PathElement> Tokenizer::computeNext() {
VELOX_UNREACHABLE();
}

// Attempts to match 'expected' only when it is one of the configured
// separators. A character that is not a separator is never matched, so
// tryMatch() is not invoked for it.
bool Tokenizer::tryMatchSeparator(char expected) {
  if (!isSeparator(expected)) {
    return false;
  }
  return tryMatch(expected);
}

void Tokenizer::match(char expected) {
if (!tryMatch(expected)) {
invalidSubfieldPath();
Expand All @@ -105,7 +115,8 @@ char Tokenizer::peekCharacter() {
std::unique_ptr<Subfield::PathElement> Tokenizer::matchPathSegment() {
// Seek until we see a separator, a special character, or whitespace.
int start = index_;
while (hasNextCharacter() && isUnquotedPathCharacter(peekCharacter())) {
while (hasNextCharacter() && !isSeparator(peekCharacter()) &&
isUnquotedPathCharacter(peekCharacter())) {
nextCharacter();
}
int end = index_;
Expand Down Expand Up @@ -143,9 +154,14 @@ std::unique_ptr<Subfield::PathElement> Tokenizer::matchUnquotedSubscript() {
return std::make_unique<Subfield::LongSubscript>(index);
}

// Returns true when 'c' is contained in separators_, i.e. it is currently
// configured as a field name separator.
bool Tokenizer::isSeparator(char c) {
  return std::any_of(
      separators_.begin(), separators_.end(), [c](char separator) {
        return separator == c;
      });
}

bool Tokenizer::isUnquotedPathCharacter(char c) {
return c == ':' || c == '$' || c == '-' || c == '/' || c == '@' || c == '|' ||
c == '#' || isUnquotedSubscriptCharacter(c);
c == '#' || c == '.' || isUnquotedSubscriptCharacter(c);
}

bool Tokenizer::isUnquotedSubscriptCharacter(char c) {
Expand Down
15 changes: 14 additions & 1 deletion velox/type/Tokenizer.h
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,10 @@ class Tokenizer {
kFailed,
};

explicit Tokenizer(const std::string& path);
// 'separators': optional custom set of separator characters used to tokenize
// the field name. When std::nullopt, the default separators are kept.
explicit Tokenizer(
const std::string& path,
const std::optional<std::vector<char>>& separators);

bool hasNext();

Expand All @@ -51,6 +54,9 @@ class Tokenizer {
const char UNICODE_CARET = '^';

const std::string path_;
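// Characters treated as field name separators. Initialized to the default set
// below and replaced when custom separators are passed to the constructor.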
std::vector<char> separators_ =
{DOT, OPEN_BRACKET, QUOTE, WILDCARD, CLOSE_BRACKET};
int index_;
State state;
bool firstSegment = true;
Expand All @@ -60,6 +66,10 @@ class Tokenizer {

std::unique_ptr<Subfield::PathElement> computeNext();

// Returns true only if 'expected' is one of the configured separators and
// tryMatch(expected) succeeds. Characters that are not separators never match.
bool tryMatchSeparator(char expected);

void match(char expected);

bool tryMatch(char expected);
Expand All @@ -74,6 +84,9 @@ class Tokenizer {

bool tryToComputeNext();

// Returns whether 'c' is one of the configured field name separators
// (i.e. is contained in separators_).
bool isSeparator(char c);

void invalidSubfieldPath();

bool isUnquotedPathCharacter(char c);
Expand Down
13 changes: 9 additions & 4 deletions velox/type/tests/SubfieldTest.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -20,9 +20,10 @@
using namespace facebook::velox::common;

std::vector<std::unique_ptr<Subfield::PathElement>> tokenize(
const std::string& path) {
const std::string& path,
const std::optional<std::vector<char>>& separators = std::nullopt) {
std::vector<std::unique_ptr<Subfield::PathElement>> elements;
Tokenizer tokenizer(path);
Tokenizer tokenizer(path, separators);
while (tokenizer.hasNext()) {
elements.push_back(tokenizer.next());
}
Expand All @@ -47,8 +48,10 @@ TEST(SubfieldTest, invalidPaths) {
assertInvalidSubfield("a[2].[3].", "Invalid subfield path: a[2].^[3].");
}

void testColumnName(const std::string& name) {
auto elements = tokenize(name);
void testColumnName(
const std::string& name,
const std::optional<std::vector<char>>& separators = std::nullopt) {
auto elements = tokenize(name, separators);
EXPECT_EQ(elements.size(), 1);
EXPECT_EQ(*elements[0].get(), Subfield::NestedField(name));
}
Expand All @@ -59,6 +62,8 @@ TEST(SubfieldTest, columnNamesWithSpecialCharacters) {
testColumnName("a/b/c:12");
testColumnName("@basis");
testColumnName("@basis|city_id");
std::vector<char> separators = {'[', ']', '\"', '*'};
testColumnName("city.id@address:number/date|day$a-b$10_bucket", separators);
}

std::vector<std::unique_ptr<Subfield::PathElement>> createElements() {
Expand Down

0 comments on commit e404d87

Please sign in to comment.