diff options
Diffstat (limited to 'users/wpcarro/scratch/simple-select')
-rw-r--r-- | users/wpcarro/scratch/simple-select/README.md | 71 | ||||
-rw-r--r-- | users/wpcarro/scratch/simple-select/main.py | 262 | ||||
-rw-r--r-- | users/wpcarro/scratch/simple-select/parser.py | 31 | ||||
-rw-r--r-- | users/wpcarro/scratch/simple-select/scanner.py | 27 |
4 files changed, 391 insertions, 0 deletions
diff --git a/users/wpcarro/scratch/simple-select/README.md b/users/wpcarro/scratch/simple-select/README.md new file mode 100644 index 000000000000..69e5707302ab --- /dev/null +++ b/users/wpcarro/scratch/simple-select/README.md @@ -0,0 +1,71 @@ +# Simple Select + +- Simple Select is a less expressive but more ergonomic query language for + tabular data than SQL. +- `slx` is a command-line tool for querying CSVs using the Simple Select query + language. + +Simple Select queries look like this: `director:"Tarantino" OR director:"Scorsese"`. + +## Example + +Say we have the following data in a CSV: + +```csv +title,year,rating,director +"Spirited Away",2001,8.5,"Hayao Miyazaki" +Andhadhun,2018,8.1,"Sriram Raghavan" +Dangal,2016,8.3,"Sriram Raghavan" +"Avengers: Infinity War",2019,8.4,"Anthony Russo" +Alien,1979,8.4,"Ridley Scott" +... +``` + +We can invoke `slx` like so... + +``` +$ slx -f /tmp/movies.csv +``` + +...and then query using the REPL: + +``` +> director:/S.*m/ OR director:"Hayao" +Andhadhun 2018 8.1 1 Sriram Raghavan 0 1 +Dangal 2016 8.3 1 Sriram Raghavan 0 1 +Howls Moving Castle 2004 8.2 0 Hayao Miyazaki 1 1 +Judgment at Nuremberg 1961 8.1 0 Stanley Kramer 0 0 +Laputa: Castle in the Sky 1986 8.0 0 Hayao Miyazaki 1 1 +Nausicaa of the Valley of the Wind 1984 8.0 0 Hayao Miyazaki 1 1 +Network 1976 8.1 0 Sidney Lumet 0 0 +``` + +## Warning + +Simple Select is **not intended for production use**. I wrote this as a toy +project for my own consumption. There are quite a few bugs of which I'm aware +and quite a few other features that I'd like to support but haven't had time to +support just yet. + +Why publish it then? Maybe this project will inspire drive-by contributions or +other, better-implemented spin-offs. + +## Wish List + +Speaking of drive-by contributions, here are some things that I'd like to +support: + +- Implicit `AND` conjunctions (`director:/Tarantino/ year:"2000"` instead of + `director:/Tarantino/ AND year:"2000"`) +- Support for types like numbers, dates (`year:2000` instead of `year:"2000"`) +- `slx` should support CSV *and* (at the very least) sqlite3 file formats (open + to other formats as well) +- Regexes should be the default query primitive (`director:Tarantino` instead of + `director:/Tarantino/`) +- Improve parsing errors (including surfacing errors to the user) +- Support for reading from `STDIN` and issuing queries from the command-line +- Unit-testing +- Configurable delimiters for output data (right now it's just `\t`) +- (Maybe) rewrite in a faster, more-type-safe languages (e.g. Rust) + +I'm likely missing other FRs, bugs, so please file issues! diff --git a/users/wpcarro/scratch/simple-select/main.py b/users/wpcarro/scratch/simple-select/main.py new file mode 100644 index 000000000000..3ae6c5d60e08 --- /dev/null +++ b/users/wpcarro/scratch/simple-select/main.py @@ -0,0 +1,262 @@ +from argparse import ArgumentParser + +import csv +from parser import Parser +import sqlite3 +import string +from scanner import Scanner +import re +import readline + +################################################################################ +# Predicates +################################################################################ + +def is_alpha(c): + return c in string.ascii_letters + +def is_digit(c): + return c in "0123456789" + +def is_alphanumeric(c): + return is_alpha(c) or is_digit(c) + +def is_whitespace(c): + return c in " \r\t\n" + +################################################################################ +# Tokenizer +################################################################################ + +AND = ("CONJUNCTION", "AND") +OR = ("CONJUNCTION", "OR") +NOT = ("PUNCTUATION", "NOT") +COLON = ("PUNCTUATION", "COLON") +LPAREN = ("PUNCTUATION", "LPAREN") +RPAREN = ("PUNCTUATION", "RPAREN") + +def tokenize(x): + s = Scanner(x) + tokens = scan_tokens(s) + return tokens + +def scan_tokens(s): + result = [] + while not s.exhausted(): + if is_whitespace(s.peek()): + s.advance() + else: + result.append(scan_token(s)) + return result + +def scan_token(s): + punctuation = { + "-": NOT, + ":": COLON, + "(": LPAREN, + ")": RPAREN, + } + c = s.peek() + if c in punctuation: + s.advance() + return punctuation[c] + if c == "\"": + return tokenize_string(s) + if c == "/": + return tokenize_regex(s) + if is_alpha(c): + return tokenize_identifier(s) + +def tokenize_string(s): + s.advance() # ignore opening 2x-quote + current = "" + while s.peek() != "\"" and not s.exhausted(): + current += s.advance() + if s.exhausted(): + raise Exception("Unterminated string") + s.advance() # ignore closing 2x-quote + return ("STRING", current) + +def tokenize_regex(s): + s.advance() # ignore opening forward-slash + current = "" + while s.peek() != "/" and not s.exhausted(): + current += s.advance() + if s.exhausted(): + raise Exception("Unterminated regex") + s.advance() # ignore closing forward-slash + return ("REGEX", current) + +def tokenize_identifier(s): + conjunctions = { + "AND", + "OR", + } + current = s.advance() + while is_alphanumeric(s.peek()): + current += s.advance() + if current.upper() in conjunctions: + return ("CONJUNCTION", current.upper()) + else: + return ("IDENTIFIER", current) + +################################################################################ +# Parser +################################################################################ + +# EBNF +# Note: we order expression types by ascending levels of precedence. +# +# expression -> conjunction ; +# conjunction -> selection ( ( "AND" | "OR" )? selection )* ; +# selection -> "-"? IDENTIFIER ":" ( REGEX | STRING ) | grouping ; +# grouping -> REGEX | STRING | "(" expression ")" ; + +def parse(x): + tokens = tokenize(x) + p = Parser(tokens) + return expression(p) + +def expression(p): + return conjunction(p) + +def conjunction(p): + lhs = selection(p) + + # TODO(wpcarro): Support default AND conjuctions when they're undefined. + while not p.exhausted() and p.match({AND, OR}): + conj = p.peek(n=-1) + rhs = selection(p) + lhs = ("CONJUNCTION", conj[1], lhs, rhs) + + return lhs + +def selection(p): + negate = False + if p.peek() == NOT: + negate = True + p.advance() + + if p.peek()[0] != "IDENTIFIER": + return grouping(p) + + ident = p.expect(lambda x: x[0] == "IDENTIFIER") + colon = p.expect(lambda x: x[1] == "COLON") + value = p.expect(lambda x: x[0] in {"REGEX", "STRING"}) + return ("SELECTION", negate, ident[1], value) + +def grouping(p): + if p.peek()[0] == "REGEX": + return p.advance() + + if p.peek()[0] == "STRING": + return p.advance() + + if p.peek() == LPAREN: + p.advance() + expr = expression(p) + p.expect(lambda x: x == RPAREN) + return ("GROUPING", expr) + +################################################################################ +# Compiler +################################################################################ + +def compile(source, table, columns): + ast = parse(source) + return "SELECT * FROM {} WHERE {};".format(table, do_compile(ast, columns)) + +def do_compile(ast, columns): + if ast[0] == "REGEX": + cols = "({})".format(" || ".join(columns)) + return "{} REGEXP '.*{}.*'".format(cols, ast[1]) + + if ast[0] == "STRING": + cols = "({})".format(" || ".join(columns)) + return "{} LIKE '%{}%'".format(cols, ast[1]) + + if ast[0] == "SELECTION": + return compile_selection(ast) + + if ast[0] == "CONJUNCTION": + _, conj, lhs, rhs = ast + lhs = do_compile(lhs, columns) + rhs = do_compile(rhs, columns) + return "{} {} {}".format(lhs, conj, rhs) + + if ast[0] == "GROUPING": + return "({})".format(do_compile(ast[1], columns)) + + raise Exception("Unexpected AST: \"{}\"".format(ast)) + +def compile_selection(ast): + _, negate, column, query = ast + match = compile_query(negate, query) + return "{} {}".format(column, match) + +def compile_query(negate, query): + query_type, query_string = query + if query_type == "REGEX": + if negate: + return "NOT REGEXP '.*{}.*'".format(query_string) + return "REGEXP '.*{}.*'".format(query_string) + + if query_type == "STRING": + if negate: + return "NOT LIKE '%{}%'".format(query_string) + return "LIKE '%{}%'".format(query_string) + +################################################################################ +# Helper Functions +################################################################################ + +def regexp(expr, x): + reg = re.compile(expr) + return reg.search(x) is not None + +################################################################################ +# Main +################################################################################ + +def main(csv_path=None, debug=False): + # Import CSV to SQLite + table = "main" + con = sqlite3.connect(":memory:") + + con.create_function("REGEXP", 2, regexp) + + cur = con.cursor() + with open(csv_path, "r") as f: + r = csv.DictReader(f) + columns = next(r).keys() + + # TODO(wpcarro): Use safer interpolation variant of "?" here and throughout. + cur.execute("CREATE TABLE {} ({});".format(table, ",".join(columns))) + rows = [tuple(row[col] for col in columns) for row in r] + cur.executemany("INSERT INTO {} ({}) VALUES ({});".format(table, ",".join(columns), ",".join("?" for _ in columns)), rows) + con.commit() + + while True: + x = input("> ") + + if debug: + print("tokens:\t{}".format(tokenize(x))) + print("AST:\t{}".format(parse(x))) + print("query:\t\"{}\"".format(compile(x, table, columns))) + + try: + compile(x, table, columns) + for row in cur.execute(compile(x, table, columns)): + print("\t".join(str(cell) for cell in row)) + except: + print("Compilation error.") + + # TODO(wpcarro): Trap exits and ensure cleanup always runs. + con.close() + +if __name__ == "__main__": + parser = ArgumentParser() + parser.add_argument("-f", "--file", dest="file", help="Path to the CSV from which to read", metavar="PATH") + parser.add_argument("-d", "--debug", dest="debug", default=False, action="store_true", help="Enable debugging") + args = parser.parse_args() + main(csv_path=args.file, debug=args.debug) diff --git a/users/wpcarro/scratch/simple-select/parser.py b/users/wpcarro/scratch/simple-select/parser.py new file mode 100644 index 000000000000..d26f970e57a2 --- /dev/null +++ b/users/wpcarro/scratch/simple-select/parser.py @@ -0,0 +1,31 @@ +class Parser(object): + def __init__(self, tokens): + self.tokens = tokens + self.i = 0 + + def exhausted(self): + return self.i >= len(self.tokens) + + def peek(self, n=0): + return self.tokens[self.i + n] + + def advance(self): + if not self.exhausted(): + self.i += 1 + return self.peek(n=-1) + + def match(self, xs): + if self.peek() in xs: + self.advance() + return True + return False + + def test(self, predicate): + return predicate(self.tokens, self.i) + + def expect(self, predicate): + if self.exhausted(): + raise Exception("Unexpected EOL") + if predicate(self.peek()): + return self.advance() + raise Exception("Unexpected token: \"{}\"".format(self.peek())) diff --git a/users/wpcarro/scratch/simple-select/scanner.py b/users/wpcarro/scratch/simple-select/scanner.py new file mode 100644 index 000000000000..5dae68aee551 --- /dev/null +++ b/users/wpcarro/scratch/simple-select/scanner.py @@ -0,0 +1,27 @@ +# According to Crafting Interpreters, the only two primitives that a +# scanner/lexer needs are peek and advance; other functions (e.g. match) are +# nice-to-haves. +class Scanner(object): + def __init__(self, chars): + self.i = 0 + self.chars = chars + + def exhausted(self): + return self.i >= len(self.chars) + + def peek(self, n=0): + return self.chars[self.i + n] if self.i in range(0, len(self.chars)) else '\0' + + def advance(self): + result = self.peek() + self.i += 1 + return result + + def match(self, x): + if self.exhausted(): + return False + if self.peek() == x: + self.advance() + return True + else: + return False |