Diffstat (limited to 'users/wpcarro/scratch/simple-select/main.py')
-rw-r--r--  users/wpcarro/scratch/simple-select/main.py  85
1 file changed, 84 insertions(+), 1 deletion(-)
diff --git a/users/wpcarro/scratch/simple-select/main.py b/users/wpcarro/scratch/simple-select/main.py
index 6a86324ef73f..0aea8dcffc5e 100644
--- a/users/wpcarro/scratch/simple-select/main.py
+++ b/users/wpcarro/scratch/simple-select/main.py
@@ -1,8 +1,91 @@
+import string
 from scanner import Scanner
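+
+# The Scanner class imported above is assumed to expose three methods:
+#   s.peek()      return the current character without consuming it
+#   s.advance()   consume the current character and return it
+#   s.exhausted() return True once every character has been consumed
+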
+################################################################################
+# Predicates
+################################################################################
+
+def is_alpha(c):
+  return c in string.ascii_letters
+
+def is_digit(c):
+  return c in string.digits
+
+def is_alphanumeric(c):
+  return is_alpha(c) or is_digit(c)
+
+def is_whitespace(c):
+  return c in " \r\t\n"
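+
+# e.g. is_alpha("a"), is_digit("7"), and is_whitespace("\t") are all True,
+# while is_alphanumeric("?") is False.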
+
+################################################################################
+# Tokenizer
+################################################################################
 
 def tokenize(x):
   s = Scanner(x)
-  return None
+  tokens = scan_tokens(s)
+  return tokens
+
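+# A worked example, assuming the Scanner interface sketched above:
+#
+#   tokenize('-name:"foo bar" AND /ba.*/')
+#   => ["NOT", ("IDENTIFIER", "name"), "COLON", ("STRING", "foo bar"),
+#       ("KEYWORD", "AND"), ("REGEX", "ba.*")]
+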
+def scan_tokens(s):
+  result = []
+  while not s.exhausted():
+    if is_whitespace(s.peek()):
+      s.advance()
+    else:
+      result.append(scan_token(s))
+  return result
+
+def scan_token(s):
+  punctuation = {
+      "-": "NOT",
+      ":": "COLON",
+  }
+  c = s.peek()
+  if c in punctuation:
+    s.advance()
+    return punctuation[c]
+  if c == "\"":
+    return tokenize_string(s)
+  if c == "/":
+    return tokenize_regex(s)
+  if is_alpha(c):
+    return tokenize_identifier(s)
+  raise Exception("Unexpected character: '{}'".format(c))
+
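+# Note the asymmetry in token shapes: punctuation tokens are bare strings
+# ("NOT", "COLON"), while every other token is a ("TYPE", value) pair.
+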
+def tokenize_string(s):
+  s.advance() # ignore opening double-quote
+  current = ""
+  while s.peek() != "\"" and not s.exhausted():
+    current += s.advance()
+  if s.exhausted():
+    raise Exception("Unterminated string")
+  s.advance() # ignore closing double-quote
+  return ("STRING", current)
+
+def tokenize_regex(s):
+  s.advance() # ignore opening forward-slash
+  current = ""
+  while s.peek() != "/" and not s.exhausted():
+    current += s.advance()
+  if s.exhausted():
+    raise Exception("Unterminated regex")
+  s.advance() # ignore closing forward-slash
+  return ("REGEX", current)
+
+def tokenize_identifier(s):
+  keywords = {
+      "AND",
+      "OR",
+  }
+  current = s.advance()
+  while not s.exhausted() and is_alphanumeric(s.peek()):
+    current += s.advance()
+  if current.upper() in keywords:
+    return ("KEYWORD", current.upper())
+  else:
+    return ("IDENTIFIER", current)
+
+################################################################################
+# Main
+################################################################################
 
 def main():
   while True: