From eb412699b66e3bd222c6421b3cf363fb1af8fbbc Mon Sep 17 00:00:00 2001 From: William Carroll Date: Mon, 28 Feb 2022 18:02:31 -0800 Subject: feat(wpcarro/simple-select): Parse query language 🎉 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Seems to successfully handle inputs like... ``` > (-fname:"William" lname:/C.*l/) OR (fname:"William" -lname:"Carroll") ``` Change-Id: I5277cfbc7d102158eab5e1e71b2d95aaf13508fd Reviewed-on: https://cl.tvl.fyi/c/depot/+/5340 Reviewed-by: wpcarro Autosubmit: wpcarro Tested-by: BuildkiteCI --- users/wpcarro/scratch/simple-select/main.py | 83 +++++++++++++++++++++++++-- users/wpcarro/scratch/simple-select/parser.py | 29 +++++----- 2 files changed, 93 insertions(+), 19 deletions(-) (limited to 'users/wpcarro/scratch') diff --git a/users/wpcarro/scratch/simple-select/main.py b/users/wpcarro/scratch/simple-select/main.py index 0aea8dcffc..3bcf85e5ef 100644 --- a/users/wpcarro/scratch/simple-select/main.py +++ b/users/wpcarro/scratch/simple-select/main.py @@ -1,5 +1,7 @@ import string from scanner import Scanner +from parser import Parser + ################################################################################ # Predicates ################################################################################ @@ -20,6 +22,13 @@ def is_whitespace(c): # Tokenizer ################################################################################ +AND = ("CONJUNCTION", "AND") +OR = ("CONJUNCTION", "OR") +NOT = ("PUNCTUATION", "NOT") +COLON = ("PUNCTUATION", "COLON") +LPAREN = ("PUNCTUATION", "LPAREN") +RPAREN = ("PUNCTUATION", "RPAREN") + def tokenize(x): s = Scanner(x) tokens = scan_tokens(s) @@ -36,8 +45,10 @@ def scan_tokens(s): def scan_token(s): punctuation = { - "-": "NOT", - ":": "COLON", + "-": NOT, + ":": COLON, + "(": LPAREN, + ")": RPAREN, } c = s.peek() if c in punctuation: @@ -71,18 +82,78 @@ def tokenize_regex(s): return ("REGEX", current) def tokenize_identifier(s): - keywords = { + conjunctions = { "AND", "OR", } current = s.advance() while is_alphanumeric(s.peek()): current += s.advance() - if current.upper() in keywords: - return ("KEYWORD", current.upper()) + if current.upper() in conjunctions: + return ("CONJUNCTION", current.upper()) else: return ("IDENTIFIER", current) +################################################################################ +# Parser +################################################################################ + +# EBNF +# Note: we order expression types by ascending levels of precedence. +# +# expression -> conjunction ; +# conjunction -> selection ( ( "AND" | "OR" )? selection )* ; +# selection -> "-"? IDENTIFIER ":" ( REGEX | STRING ) | grouping ; +# grouping -> REGEX | STRING | "(" expression ")" ; + +def parse(x): + tokens = tokenize(x) + p = Parser(tokens) + return expression(p) + +def expression(p): + return conjunction(p) + +def conjunction(p): + lhs = selection(p) + + while not p.exhausted() and p.test(lambda tokens, i: tokens[i] not in {LPAREN, RPAREN}): + conj = p.advance() if p.peek()[0] == "CONJUNCTION" else AND + rhs = selection(p) + lhs = ("CONJUNCTION", conj[1], lhs, rhs) + + if not p.exhausted(): + raise Exception("Encountered more tokens than we can parse: \"{}\"".format(p.tokens[p.i:])) + + return lhs + +def selection(p): + negate = False + if p.peek() == NOT: + negate = True + p.advance() + + if p.peek()[0] != "IDENTIFIER": + return grouping(p) + + ident = p.expect(lambda x: x[0] == "IDENTIFIER") + colon = p.expect(lambda x: x[1] == "COLON") + value = p.expect(lambda x: x[0] in {"REGEX", "STRING"}) + return ("SELECTION", negate, ident[1], value) + +def grouping(p): + if p.peek()[0] == "REGEX": + return p.advance() + + if p.peek()[0] == "STRING": + return p.advance() + + if p.peek() == LPAREN: + p.advance() + expr = expression(p) + p.expect(lambda x: x == RPAREN) + return ("GROUPING", expr) + ################################################################################ # Main ################################################################################ @@ -90,7 +161,7 @@ def tokenize_identifier(s): def main(): while True: x = input("> ") - print(tokenize(x)) + print(parse(x)) if __name__ == "__main__": main() diff --git a/users/wpcarro/scratch/simple-select/parser.py b/users/wpcarro/scratch/simple-select/parser.py index 407bff61c9..d26f970e57 100644 --- a/users/wpcarro/scratch/simple-select/parser.py +++ b/users/wpcarro/scratch/simple-select/parser.py @@ -3,26 +3,29 @@ class Parser(object): self.tokens = tokens self.i = 0 - def prev(self): - return self.tokens[self.i - 1] + def exhausted(self): + return self.i >= len(self.tokens) - def curr(self): - return self.tokens[self.i] + def peek(self, n=0): + return self.tokens[self.i + n] - def consume(self): + def advance(self): if not self.exhausted(): self.i += 1 - return self.prev() + return self.peek(n=-1) def match(self, xs): - if not self.exhausted() and self.curr() in xs: - self.consume() + if self.peek() in xs: + self.advance() return True return False - def expect(self, xs): - if not self.match(xs): - raise Exception("Expected token \"{}\" but received \"{}\"".format(xs, self.curr())) + def test(self, predicate): + return predicate(self.tokens, self.i) - def exhausted(self): - return self.i >= len(self.tokens) + def expect(self, predicate): + if self.exhausted(): + raise Exception("Unexpected EOL") + if predicate(self.peek()): + return self.advance() + raise Exception("Unexpected token: \"{}\"".format(self.peek())) -- cgit 1.4.1