Skip to content

Commit

Permalink
add regular chars
Browse files Browse the repository at this point in the history
  • Loading branch information
rui-mo committed Aug 23, 2023
1 parent 918156e commit fafcbe0
Show file tree
Hide file tree
Showing 5 changed files with 69 additions and 23 deletions.
6 changes: 4 additions & 2 deletions velox/type/Subfield.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -18,8 +18,10 @@

namespace facebook::velox::common {

Subfield::Subfield(const std::string& path) {
Tokenizer tokenizer(path);
Subfield::Subfield(
const std::string& path,
const std::shared_ptr<Separators>& separators) {
Tokenizer tokenizer(path, separators);
VELOX_CHECK(tokenizer.hasNext(), "Column name is missing: {}", path);

auto firstElement = tokenizer.next();
Expand Down
20 changes: 19 additions & 1 deletion velox/type/Subfield.h
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,20 @@ enum SubfieldKind {
kLongSubscript
};

/// The set of characters treated as structural separators while tokenizing a
/// subfield path. Each member holds one separator character; a member can be
/// reassigned to customize which characters are recognized.
///
/// The defaults are written as character literals (the same values as the
/// Tokenizer constants) so this struct does not depend on Tokenizer's private
/// members and can be declared before Tokenizer is defined.
struct Separators {
  /// Returns true if 'c' equals any of the configured separator characters.
  bool isSeparator(char c) const {
    return (
        c == dot || c == openBracket || c == quote || c == wildCard ||
        c == closeBracket);
  }

  char dot = '.';
  char openBracket = '[';
  char quote = '\"';
  char wildCard = '*';
  char closeBracket = ']';
};

class Subfield {
public:
class PathElement {
Expand Down Expand Up @@ -193,7 +207,11 @@ class Subfield {
};

public:
explicit Subfield(const std::string& path);
// Separators: the customized separators to tokenize field name.
explicit Subfield(
const std::string& path,
const std::shared_ptr<Separators>& separators =
std::make_shared<Separators>());

explicit Subfield(std::vector<std::unique_ptr<PathElement>>&& path);

Expand Down
24 changes: 16 additions & 8 deletions velox/type/Tokenizer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,10 @@

namespace facebook::velox::common {

Tokenizer::Tokenizer(const std::string& path) : path_(path) {
// Builds a tokenizer over 'path' that consults 'separators' to decide which
// characters act as separators. Scanning starts at index 0 in the kNotReady
// state.
Tokenizer::Tokenizer(
    const std::string& path,
    const std::shared_ptr<Separators>& separators)
    : path_(path),
      separators_(separators),
      index_(0),
      state(State::kNotReady) {}
Expand Down Expand Up @@ -54,17 +57,17 @@ std::unique_ptr<Subfield::PathElement> Tokenizer::computeNext() {
return nullptr;
}

if (tryMatch(DOT)) {
if (tryMatchSeparator(DOT)) {
std::unique_ptr<Subfield::PathElement> token = matchPathSegment();
firstSegment = false;
return token;
}

if (tryMatch(OPEN_BRACKET)) {
std::unique_ptr<Subfield::PathElement> token = tryMatch(QUOTE)
if (tryMatchSeparator(OPEN_BRACKET)) {
std::unique_ptr<Subfield::PathElement> token = tryMatchSeparator(QUOTE)
? matchQuotedSubscript()
: tryMatch(WILDCARD) ? matchWildcardSubscript()
: matchUnquotedSubscript();
: tryMatchSeparator(WILDCARD) ? matchWildcardSubscript()
: matchUnquotedSubscript();

match(CLOSE_BRACKET);
firstSegment = false;
Expand All @@ -80,6 +83,10 @@ std::unique_ptr<Subfield::PathElement> Tokenizer::computeNext() {
VELOX_UNREACHABLE();
}

// Returns true only when 'expected' is currently configured as a separator in
// separators_ and tryMatch(expected) succeeds. tryMatch is not invoked when
// 'expected' is not a separator.
bool Tokenizer::tryMatchSeparator(char expected) {
  if (!separators_->isSeparator(expected)) {
    return false;
  }
  return tryMatch(expected);
}

void Tokenizer::match(char expected) {
if (!tryMatch(expected)) {
invalidSubfieldPath();
Expand All @@ -105,7 +112,8 @@ char Tokenizer::peekCharacter() {
std::unique_ptr<Subfield::PathElement> Tokenizer::matchPathSegment() {
// seek until we see a special character or whitespace
int start = index_;
while (hasNextCharacter() && isUnquotedPathCharacter(peekCharacter())) {
while (hasNextCharacter() && !separators_->isSeparator(peekCharacter()) &&
isUnquotedPathCharacter(peekCharacter())) {
nextCharacter();
}
int end = index_;
Expand Down Expand Up @@ -145,7 +153,7 @@ std::unique_ptr<Subfield::PathElement> Tokenizer::matchUnquotedSubscript() {

bool Tokenizer::isUnquotedPathCharacter(char c) {
return c == ':' || c == '$' || c == '-' || c == '/' || c == '@' || c == '|' ||
c == '#' || isUnquotedSubscriptCharacter(c);
c == '#' || c == '.' || isUnquotedSubscriptCharacter(c);
}

bool Tokenizer::isUnquotedSubscriptCharacter(char c) {
Expand Down
28 changes: 20 additions & 8 deletions velox/type/Tokenizer.h
Original file line number Diff line number Diff line change
Expand Up @@ -35,22 +35,27 @@ class Tokenizer {
kFailed,
};

explicit Tokenizer(const std::string& path);
// Separators: the customized separators to tokenize field name.
explicit Tokenizer(
const std::string& path,
const std::shared_ptr<Separators>& separators);

bool hasNext();

std::unique_ptr<Subfield::PathElement> next();

private:
const char DOT = '.';
const char QUOTE = '\"';
const char BACKSLASH = '\\';
const char WILDCARD = '*';
const char OPEN_BRACKET = '[';
const char CLOSE_BRACKET = ']';
const char UNICODE_CARET = '^';
static const char DOT = '.';
static const char QUOTE = '\"';
static const char BACKSLASH = '\\';
static const char WILDCARD = '*';
static const char OPEN_BRACKET = '[';
static const char CLOSE_BRACKET = ']';
static const char UNICODE_CARET = '^';

const std::string path_;
std::shared_ptr<Separators> separators_;

int index_;
State state;
bool firstSegment = true;
Expand All @@ -60,6 +65,10 @@ class Tokenizer {

std::unique_ptr<Subfield::PathElement> computeNext();

// Returns whether the expected char is a separator and
// can be found.
bool tryMatchSeparator(char expected);

void match(char expected);

bool tryMatch(char expected);
Expand All @@ -74,6 +83,9 @@ class Tokenizer {

bool tryToComputeNext();

// Returns whether the char is a field name separator.
bool isSeparator(char c);

void invalidSubfieldPath();

bool isUnquotedPathCharacter(char c);
Expand Down
14 changes: 10 additions & 4 deletions velox/type/tests/SubfieldTest.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -20,9 +20,10 @@
using namespace facebook::velox::common;

std::vector<std::unique_ptr<Subfield::PathElement>> tokenize(
const std::string& path) {
const std::string& path,
const std::shared_ptr<Separators>& separators = std::make_shared<Separators>()) {
std::vector<std::unique_ptr<Subfield::PathElement>> elements;
Tokenizer tokenizer(path);
Tokenizer tokenizer(path, separators);
while (tokenizer.hasNext()) {
elements.push_back(tokenizer.next());
}
Expand All @@ -47,8 +48,10 @@ TEST(SubfieldTest, invalidPaths) {
assertInvalidSubfield("a[2].[3].", "Invalid subfield path: a[2].^[3].");
}

void testColumnName(const std::string& name) {
auto elements = tokenize(name);
void testColumnName(
const std::string& name,
const std::shared_ptr<Separators>& separators = std::make_shared<Separators>()) {
auto elements = tokenize(name, separators);
EXPECT_EQ(elements.size(), 1);
EXPECT_EQ(*elements[0].get(), Subfield::NestedField(name));
}
Expand All @@ -59,6 +62,9 @@ TEST(SubfieldTest, columnNamesWithSpecialCharacters) {
testColumnName("a/b/c:12");
testColumnName("@basis");
testColumnName("@basis|city_id");
auto separators = std::make_shared<Separator>();
separators->dot = '\0';
testColumnName("city.id@address:number/date|day$a-b$10_bucket", separators);
}

std::vector<std::unique_ptr<Subfield::PathElement>> createElements() {
Expand Down

0 comments on commit fafcbe0

Please sign in to comment.