diff options
author | William Carroll <wpcarro@gmail.com> | 2022-03-01T02·02-0800 |
---|---|---|
committer | clbot <clbot@tvl.fyi> | 2022-03-01T02·07+0000 |
commit | eb412699b66e3bd222c6421b3cf363fb1af8fbbc (patch) | |
tree | 99919e9a81f6682c5583e6f4013b9de76796b9b1 | |
parent | b5f78e433c80f4fc29550c6278bf9ef7a4a7dbe2 (diff) |
feat(wpcarro/simple-select): Parse query language 🎉 r/3882
Seems to successfully handle inputs like... ``` > (-fname:"William" lname:/C.*l/) OR (fname:"William" -lname:"Carroll") ``` Change-Id: I5277cfbc7d102158eab5e1e71b2d95aaf13508fd Reviewed-on: https://cl.tvl.fyi/c/depot/+/5340 Reviewed-by: wpcarro <wpcarro@gmail.com> Autosubmit: wpcarro <wpcarro@gmail.com> Tested-by: BuildkiteCI
-rw-r--r-- | users/wpcarro/scratch/simple-select/main.py | 83 | ||||
-rw-r--r-- | users/wpcarro/scratch/simple-select/parser.py | 29 |
2 files changed, 93 insertions, 19 deletions
def tokenize_identifier(s):
    """Scan one alphanumeric word from the scanner `s`.

    Yields a ("CONJUNCTION", word) token when the word is AND or OR
    (matched case-insensitively), and an ("IDENTIFIER", word) token
    for anything else.
    """
    word = s.advance()
    while is_alphanumeric(s.peek()):
        word += s.advance()
    keyword = word.upper()
    if keyword in ("AND", "OR"):
        return ("CONJUNCTION", keyword)
    return ("IDENTIFIER", word)
def grouping(p):
    """Parse the highest-precedence form of the grammar.

    grouping -> REGEX | STRING | "(" expression ")" ;

    Returns either the atom token itself or a ("GROUPING", expr) node
    for a parenthesized sub-expression. Raises on any other token.
    """
    if p.peek()[0] == "REGEX":
        return p.advance()

    if p.peek()[0] == "STRING":
        return p.advance()

    if p.peek() == LPAREN:
        p.advance()
        expr = expression(p)
        # Require the matching close paren; expect() raises if absent.
        p.expect(lambda x: x == RPAREN)
        return ("GROUPING", expr)

    # Fix: previously this fell through and implicitly returned None,
    # which callers would silently embed in the parse tree. Fail loudly
    # instead so malformed queries are reported at the offending token.
    raise Exception("Unexpected token: \"{}\"".format(p.peek()))
def expect(self, predicate):
    """Consume and return the next token if `predicate` accepts it.

    Raises when the token stream is already exhausted ("Unexpected EOL")
    or when the current token fails the predicate.
    """
    if self.exhausted():
        raise Exception("Unexpected EOL")
    token = self.peek()
    if not predicate(token):
        raise Exception("Unexpected token: \"{}\"".format(token))
    return self.advance()