Diffstat (limited to 'users/wpcarro/scratch/simple-select/main.py')
 users/wpcarro/scratch/simple-select/main.py | 83 ++++++++++++++++++++++++++---
 1 file changed, 77 insertions(+), 6 deletions(-)
diff --git a/users/wpcarro/scratch/simple-select/main.py b/users/wpcarro/scratch/simple-select/main.py
index 0aea8dcffc5e..3bcf85e5efdf 100644
--- a/users/wpcarro/scratch/simple-select/main.py
+++ b/users/wpcarro/scratch/simple-select/main.py
@@ -1,5 +1,7 @@
 import string
 from scanner import Scanner
+from parser import Parser
+
 ################################################################################
 # Predicates
 ################################################################################
@@ -20,6 +22,13 @@ def is_whitespace(c):
 # Tokenizer
 ################################################################################
 
+AND    = ("CONJUNCTION", "AND")
+OR     = ("CONJUNCTION", "OR")
+NOT    = ("PUNCTUATION", "NOT")
+COLON  = ("PUNCTUATION", "COLON")
+LPAREN = ("PUNCTUATION", "LPAREN")
+RPAREN = ("PUNCTUATION", "RPAREN")
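+# NB: tokens are (TYPE, VALUE) tuples, which lets the parser below compare
+# scanned tokens against these singletons directly (e.g. p.peek() == NOT).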
+
 def tokenize(x):
   s = Scanner(x)
   tokens = scan_tokens(s)
@@ -36,8 +45,10 @@ def scan_tokens(s):
 
 def scan_token(s):
   punctuation = {
-      "-": "NOT",
-      ":": "COLON",
+      "-": NOT,
+      ":": COLON,
+      "(": LPAREN,
+      ")": RPAREN,
   }
   c = s.peek()
   if c in punctuation:
@@ -71,26 +82,86 @@ def tokenize_regex(s):
   return ("REGEX", current)
 
 def tokenize_identifier(s):
-  keywords = {
+  conjunctions = {
       "AND",
       "OR",
   }
   current = s.advance()
   while is_alphanumeric(s.peek()):
     current += s.advance()
-  if current.upper() in keywords:
-    return ("KEYWORD", current.upper())
+  if current.upper() in conjunctions:
+    return ("CONJUNCTION", current.upper())
   else:
     return ("IDENTIFIER", current)
 
 ################################################################################
+# Parser
+################################################################################
+
+# EBNF
+# Note: we order expression types by ascending levels of precedence.
+#
+# expression  -> conjunction ;
+# conjunction -> selection ( ( "AND" | "OR" )? selection )* ;
+# selection   -> "-"? IDENTIFIER ":" ( REGEX | STRING ) | grouping ;
+# grouping    -> REGEX | STRING | "(" expression ")" ;
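+#
+# For example, "-foo:a AND bar:b" should parse to something like (assuming
+# both values scan as REGEX tokens):
+#   ("CONJUNCTION", "AND",
+#    ("SELECTION", True, "foo", ("REGEX", "a")),
+#    ("SELECTION", False, "bar", ("REGEX", "b")))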
+
+def parse(x):
+  tokens = tokenize(x)
+  p = Parser(tokens)
+  result = expression(p)
+  # Only the top-level call may demand that all input is consumed; nested
+  # calls (inside a grouping) legitimately stop at the closing ")".
+  if not p.exhausted():
+    raise Exception("Encountered more tokens than we can parse: \"{}\"".format(p.tokens[p.i:]))
+  return result
+
+def expression(p):
+  return conjunction(p)
+
+def conjunction(p):
+  lhs = selection(p)
+
+  # Stop at RPAREN so the enclosing grouping can consume it. A missing
+  # conjunction between two selections defaults to AND, and the fold is
+  # left-associative: "a b c" parses as ((a AND b) AND c).
+  while not p.exhausted() and p.test(lambda tokens, i: tokens[i] != RPAREN):
+    conj = p.advance() if p.peek()[0] == "CONJUNCTION" else AND
+    rhs = selection(p)
+    lhs = ("CONJUNCTION", conj[1], lhs, rhs)
+
+  return lhs
+
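+# A selection node is ("SELECTION", negated, key, value), where value is the
+# raw ("REGEX", ...) or ("STRING", ...) token from the tokenizer.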
+def selection(p):
+  negate = False
+  if p.peek() == NOT:
+    negate = True
+    p.advance()
+
+  if p.peek()[0] != "IDENTIFIER":
+    # Per the grammar, "-" may only negate an IDENTIFIER selection; don't
+    # silently drop it in front of a grouping.
+    if negate:
+      raise Exception("\"-\" must be followed by an identifier, not \"{}\"".format(p.peek()))
+    return grouping(p)
+
+  ident = p.expect(lambda x: x[0] == "IDENTIFIER")
+  p.expect(lambda x: x == COLON)
+  value = p.expect(lambda x: x[0] in {"REGEX", "STRING"})
+  return ("SELECTION", negate, ident[1], value)
+
+def grouping(p):
+  if p.peek()[0] in {"REGEX", "STRING"}:
+    return p.advance()
+
+  if p.peek() == LPAREN:
+    p.advance()
+    expr = expression(p)
+    p.expect(lambda x: x == RPAREN)
+    return ("GROUPING", expr)
+
+  # Fail loudly instead of implicitly returning None on malformed input.
+  raise Exception("Expected a regex, string, or \"(\", got \"{}\"".format(p.peek()))
+
+################################################################################
 # Main
 ################################################################################
 
 def main():
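+  # Tiny REPL: parse each query and print the resulting AST tuple.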
   while True:
     x = input("> ")
-    print(tokenize(x))
+    print(parse(x))
 
 if __name__ == "__main__":
   main()