From eb412699b66e3bd222c6421b3cf363fb1af8fbbc Mon Sep 17 00:00:00 2001
From: William Carroll <wpcarro@gmail.com>
Date: Mon, 28 Feb 2022 18:02:31 -0800
Subject: feat(wpcarro/simple-select): Parse query language 🎉
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Seems to successfully handle inputs like...

```
> (-fname:"William" lname:/C.*l/) OR (fname:"William" -lname:"Carroll")
```

Change-Id: I5277cfbc7d102158eab5e1e71b2d95aaf13508fd
Reviewed-on: https://cl.tvl.fyi/c/depot/+/5340
Reviewed-by: wpcarro <wpcarro@gmail.com>
Autosubmit: wpcarro <wpcarro@gmail.com>
Tested-by: BuildkiteCI
---
 users/wpcarro/scratch/simple-select/main.py   | 83 +++++++++++++++++++++++++--
 users/wpcarro/scratch/simple-select/parser.py | 29 +++++-----
 2 files changed, 93 insertions(+), 19 deletions(-)

(limited to 'users/wpcarro/scratch')

diff --git a/users/wpcarro/scratch/simple-select/main.py b/users/wpcarro/scratch/simple-select/main.py
index 0aea8dcffc..3bcf85e5ef 100644
--- a/users/wpcarro/scratch/simple-select/main.py
+++ b/users/wpcarro/scratch/simple-select/main.py
@@ -1,5 +1,7 @@
 import string
 from scanner import Scanner
+from parser import Parser
+
 ################################################################################
 # Predicates
 ################################################################################
@@ -20,6 +22,13 @@ def is_whitespace(c):
 # Tokenizer
 ################################################################################
 
+AND    = ("CONJUNCTION", "AND")
+OR     = ("CONJUNCTION", "OR")
+NOT    = ("PUNCTUATION", "NOT")
+COLON  = ("PUNCTUATION", "COLON")
+LPAREN = ("PUNCTUATION", "LPAREN")
+RPAREN = ("PUNCTUATION", "RPAREN")
+
 def tokenize(x):
   s = Scanner(x)
   tokens = scan_tokens(s)
@@ -36,8 +45,10 @@ def scan_tokens(s):
 
 def scan_token(s):
   punctuation = {
-      "-": "NOT",
-      ":": "COLON",
+      "-": NOT,
+      ":": COLON,
+      "(": LPAREN,
+      ")": RPAREN,
   }
   c = s.peek()
   if c in punctuation:
@@ -71,18 +82,78 @@ def tokenize_regex(s):
   return ("REGEX", current)
 
 def tokenize_identifier(s):
-  keywords = {
+  conjunctions = {
       "AND",
       "OR",
   }
   current = s.advance()
   while is_alphanumeric(s.peek()):
     current += s.advance()
-  if current.upper() in keywords:
-    return ("KEYWORD", current.upper())
+  if current.upper() in conjunctions:
+    return ("CONJUNCTION", current.upper())
   else:
     return ("IDENTIFIER", current)
 
+################################################################################
+# Parser
+################################################################################
+
+# EBNF
+# Note: we order expression types by ascending levels of precedence.
+#
+# expression  -> conjunction ;
+# conjunction -> selection ( ( "AND" | "OR" )? selection )* ;
+# selection   -> "-"? IDENTIFIER ":" ( REGEX | STRING ) | grouping ;
+# grouping    -> REGEX | STRING | "(" expression ")" ;
+
+def parse(x):
+  tokens = tokenize(x)
+  p = Parser(tokens)
+  return expression(p)
+
+def expression(p):
+  return conjunction(p)
+
+def conjunction(p):
+  lhs = selection(p)
+
+  while not p.exhausted() and p.test(lambda tokens, i: tokens[i] not in {LPAREN, RPAREN}):
+    conj = p.advance() if p.peek()[0] == "CONJUNCTION" else AND
+    rhs = selection(p)
+    lhs = ("CONJUNCTION", conj[1], lhs, rhs)
+
+  if not p.exhausted():
+    raise Exception("Encountered more tokens than we can parse: \"{}\"".format(p.tokens[p.i:]))
+
+  return lhs
+
+def selection(p):
+  negate = False
+  if p.peek() == NOT:
+    negate = True
+    p.advance()
+
+  if p.peek()[0] != "IDENTIFIER":
+    return grouping(p)
+
+  ident = p.expect(lambda x: x[0] == "IDENTIFIER")
+  colon = p.expect(lambda x: x[1] == "COLON")
+  value = p.expect(lambda x: x[0] in {"REGEX", "STRING"})
+  return ("SELECTION", negate, ident[1], value)
+
+def grouping(p):
+  if p.peek()[0] == "REGEX":
+    return p.advance()
+
+  if p.peek()[0] == "STRING":
+    return p.advance()
+
+  if p.peek() == LPAREN:
+    p.advance()
+    expr = expression(p)
+    p.expect(lambda x: x == RPAREN)
+    return ("GROUPING", expr)
+
 ################################################################################
 # Main
 ################################################################################
@@ -90,7 +161,7 @@ def tokenize_identifier(s):
 def main():
   while True:
     x = input("> ")
-    print(tokenize(x))
+    print(parse(x))
 
 if __name__ == "__main__":
   main()
diff --git a/users/wpcarro/scratch/simple-select/parser.py b/users/wpcarro/scratch/simple-select/parser.py
index 407bff61c9..d26f970e57 100644
--- a/users/wpcarro/scratch/simple-select/parser.py
+++ b/users/wpcarro/scratch/simple-select/parser.py
@@ -3,26 +3,29 @@ class Parser(object):
         self.tokens = tokens
         self.i = 0
 
-    def prev(self):
-        return self.tokens[self.i - 1]
+    def exhausted(self):
+        return self.i >= len(self.tokens)
 
-    def curr(self):
-        return self.tokens[self.i]
+    def peek(self, n=0):
+        return self.tokens[self.i + n]
 
-    def consume(self):
+    def advance(self):
         if not self.exhausted():
             self.i += 1
-            return self.prev()
+        return self.peek(n=-1)
 
     def match(self, xs):
-        if not self.exhausted() and self.curr() in xs:
-            self.consume()
+        if self.peek() in xs:
+            self.advance()
             return True
         return False
 
-    def expect(self, xs):
-        if not self.match(xs):
-            raise Exception("Expected token \"{}\" but received \"{}\"".format(xs, self.curr()))
+    def test(self, predicate):
+        return predicate(self.tokens, self.i)
 
-    def exhausted(self):
-        return self.i >= len(self.tokens)
+    def expect(self, predicate):
+        if self.exhausted():
+            raise Exception("Unexpected EOL")
+        if predicate(self.peek()):
+            return self.advance()
+        raise Exception("Unexpected token: \"{}\"".format(self.peek()))
-- 
cgit 1.4.1