diff options
Diffstat (limited to 'users/wpcarro/scratch')
-rw-r--r-- | users/wpcarro/scratch/simple-select/main.py | 85 | ||||
-rw-r--r-- | users/wpcarro/scratch/simple-select/scanner.py | 8 |
2 files changed, 88 insertions, 5 deletions
# Reconstructed from a whitespace-collapsed diff of
# users/wpcarro/scratch/simple-select/{main.py,scanner.py}. What follows is the
# post-patch code. Scanner (scanner.py) is inlined ahead of the tokenizer
# (main.py) in place of `from scanner import Scanner` so the listing is
# self-contained.
import string

################################################################################
# Scanner
################################################################################

class Scanner(object):
    """Cursor over a sequence of characters.

    peek and advance are the essentials a scanner/lexer needs; other helpers
    (e.g. match) are nice-to-haves.
    """

    def __init__(self, chars):
        self.i = 0
        self.chars = chars

    def exhausted(self):
        """Return True once the cursor has moved past the last character."""
        return self.i >= len(self.chars)

    def peek(self, n=0):
        """Return the character n positions ahead without consuming it.

        Returns the NUL character when that position is outside the input.
        """
        # Bounds-check the position actually read (i + n), not just i;
        # otherwise a lookahead near the end of the input raises IndexError.
        position = self.i + n
        if 0 <= position < len(self.chars):
            return self.chars[position]
        return '\0'

    def advance(self):
        """Consume the current character and return it."""
        result = self.peek()
        # NOTE(review): the listing is cut off after the statement above. The
        # increment and return are reconstructed from how the tokenizer uses
        # advance() (it relies on the returned character and on forward
        # progress) -- confirm against scanner.py.
        self.i += 1
        return result

################################################################################
# Predicates
################################################################################

# `in` on a str is a substring test, so each predicate also checks that it was
# given exactly one character; "" and "ab" would otherwise be accepted.

def is_alpha(c):
    """Return True if c is a single ASCII letter."""
    return len(c) == 1 and c in string.ascii_letters

def is_digit(c):
    """Return True if c is a single decimal digit."""
    return len(c) == 1 and c in "0123456789"

def is_alphanumeric(c):
    """Return True if c is a single ASCII letter or decimal digit."""
    return is_alpha(c) or is_digit(c)

def is_whitespace(c):
    """Return True if c is a single space, carriage return, tab or newline."""
    return len(c) == 1 and c in " \r\t\n"

################################################################################
# Tokenizer
################################################################################

# Single-character tokens, keyed by the character that produces them.
_PUNCTUATION = {
    "-": "NOT",
    ":": "COLON",
}

# Identifiers that (case-insensitively) spell one of these become keywords.
_KEYWORDS = frozenset({
    "AND",
    "OR",
})

def tokenize(x):
    """Return the list of tokens found in the string x."""
    s = Scanner(x)
    tokens = scan_tokens(s)
    return tokens

def scan_tokens(s):
    """Scan tokens from s until it is exhausted, skipping whitespace."""
    result = []
    while not s.exhausted():
        if is_whitespace(s.peek()):
            s.advance()
        else:
            result.append(scan_token(s))
    return result

def scan_token(s):
    """Scan one token starting at the scanner's current character.

    Returns "NOT" or "COLON" for punctuation, otherwise the tuple produced by
    tokenize_string, tokenize_regex or tokenize_identifier.

    Raises:
        Exception: if the current character cannot start any token.
    """
    c = s.peek()
    if c in _PUNCTUATION:
        s.advance()
        return _PUNCTUATION[c]
    if c == "\"":
        return tokenize_string(s)
    if c == "/":
        return tokenize_regex(s)
    if is_alpha(c):
        return tokenize_identifier(s)
    # Falling through would return None without consuming anything, which
    # makes scan_tokens spin forever on the same character.
    raise Exception("Unexpected character: {!r}".format(c))

def _scan_delimited(s, delimiter, kind, error):
    """Consume text enclosed by delimiter and return (kind, text).

    Raises Exception(error) if the input ends before the closing delimiter.
    """
    s.advance()  # ignore opening delimiter
    chars = []
    while not s.exhausted() and s.peek() != delimiter:
        chars.append(s.advance())
    if s.exhausted():
        raise Exception(error)
    s.advance()  # ignore closing delimiter
    return (kind, "".join(chars))

def tokenize_string(s):
    """Scan a double-quoted string and return ("STRING", contents)."""
    return _scan_delimited(s, "\"", "STRING", "Unterminated string")

def tokenize_regex(s):
    """Scan a slash-delimited regex and return ("REGEX", contents)."""
    return _scan_delimited(s, "/", "REGEX", "Unterminated regex")

def tokenize_identifier(s):
    """Scan a run of alphanumerics starting at the current character.

    Returns ("KEYWORD", upper-cased text) when the text matches a keyword
    case-insensitively, otherwise ("IDENTIFIER", text).
    """
    chars = [s.advance()]
    while is_alphanumeric(s.peek()):
        chars.append(s.advance())
    current = "".join(chars)
    upper = current.upper()
    if upper in _KEYWORDS:
        return ("KEYWORD", upper)
    return ("IDENTIFIER", current)

################################################################################
# Main
################################################################################

# NOTE(review): main.py defines main() here (`def main(): while True: ...`),
# but its body lies outside the visible diff hunk, so it is not reproduced.