about summary refs log tree commit diff
path: root/users/wpcarro/scratch/simple-select
diff options
context:
space:
mode:
Diffstat (limited to 'users/wpcarro/scratch/simple-select')
-rw-r--r--users/wpcarro/scratch/simple-select/README.md71
-rw-r--r--users/wpcarro/scratch/simple-select/main.py262
-rw-r--r--users/wpcarro/scratch/simple-select/parser.py31
-rw-r--r--users/wpcarro/scratch/simple-select/scanner.py27
4 files changed, 391 insertions, 0 deletions
diff --git a/users/wpcarro/scratch/simple-select/README.md b/users/wpcarro/scratch/simple-select/README.md
new file mode 100644
index 000000000000..69e5707302ab
--- /dev/null
+++ b/users/wpcarro/scratch/simple-select/README.md
@@ -0,0 +1,71 @@
+# Simple Select
+
+- Simple Select is a less expressive but more ergonomic query language for
+  tabular data than SQL.
+- `slx` is a command-line tool for querying CSVs using the Simple Select query
+  language.
+
+Simple Select queries look like this: `director:"Tarantino" OR director:"Scorsese"`.
+
+## Example
+
+Say we have the following data in a CSV:
+
+```csv
+title,year,rating,director
+"Spirited Away",2001,8.5,"Hayao Miyazaki"
+Andhadhun,2018,8.1,"Sriram Raghavan"
+Dangal,2016,8.3,"Sriram Raghavan"
+"Avengers: Infinity War",2019,8.4,"Anthony Russo"
+Alien,1979,8.4,"Ridley Scott"
+...
+```
+
+We can invoke `slx` like so...
+
+```
+$ slx -f /tmp/movies.csv
+```
+
+...and then query using the REPL:
+
+```
+> director:/S.*m/ OR director:"Hayao"
+Andhadhun       2018    8.1     1       Sriram Raghavan 0       1
+Dangal  2016    8.3     1       Sriram Raghavan 0       1
+Howls Moving Castle     2004    8.2     0       Hayao Miyazaki  1       1
+Judgment at Nuremberg   1961    8.1     0       Stanley Kramer  0       0
+Laputa: Castle in the Sky       1986    8.0     0       Hayao Miyazaki  1       1
+Nausicaa of the Valley of the Wind      1984    8.0     0       Hayao Miyazaki  1       1
+Network 1976    8.1     0       Sidney Lumet    0       0
+```
+
+## Warning
+
+Simple Select is **not intended for production use**. I wrote this as a toy
+project for my own consumption. There are quite a few bugs of which I'm aware
+and quite a few other features that I'd like to support but haven't had time to
+support just yet.
+
+Why publish it then? Maybe this project will inspire drive-by contributions or
+other, better-implemented spin-offs.
+
+## Wish List
+
+Speaking of drive-by contributions, here are some things that I'd like to
+support:
+
+- Implicit `AND` conjunctions (`director:/Tarantino/ year:"2000"` instead of
+  `director:/Tarantino/ AND year:"2000"`)
+- Support for types like numbers, dates (`year:2000` instead of `year:"2000"`)
+- `slx` should support CSV *and* (at the very least) sqlite3 file formats (open
+  to other formats as well)
+- Regexes should be the default query primitive (`director:Tarantino` instead of
+  `director:/Tarantino/`)
+- Improve parsing errors (including surfacing errors to the user)
+- Support for reading from `STDIN` and issuing queries from the command-line
+- Unit-testing
+- Configurable delimiters for output data (right now it's just `\t`)
+- (Maybe) rewrite in a faster, more-type-safe languages (e.g. Rust)
+
+I'm likely missing other FRs, bugs, so please file issues!
diff --git a/users/wpcarro/scratch/simple-select/main.py b/users/wpcarro/scratch/simple-select/main.py
new file mode 100644
index 000000000000..3ae6c5d60e08
--- /dev/null
+++ b/users/wpcarro/scratch/simple-select/main.py
@@ -0,0 +1,262 @@
+from argparse import ArgumentParser
+
+import csv
+from parser import Parser
+import sqlite3
+import string
+from scanner import Scanner
+import re
+import readline
+
+################################################################################
+# Predicates
+################################################################################
+
+def is_alpha(c):
+  return c in string.ascii_letters
+
+def is_digit(c):
+  return c in "0123456789"
+
+def is_alphanumeric(c):
+  return is_alpha(c) or is_digit(c)
+
+def is_whitespace(c):
+  return c in " \r\t\n"
+
+################################################################################
+# Tokenizer
+################################################################################
+
+AND    = ("CONJUNCTION", "AND")
+OR     = ("CONJUNCTION", "OR")
+NOT    = ("PUNCTUATION", "NOT")
+COLON  = ("PUNCTUATION", "COLON")
+LPAREN = ("PUNCTUATION", "LPAREN")
+RPAREN = ("PUNCTUATION", "RPAREN")
+
+def tokenize(x):
+  s = Scanner(x)
+  tokens = scan_tokens(s)
+  return tokens
+
+def scan_tokens(s):
+  result = []
+  while not s.exhausted():
+    if is_whitespace(s.peek()):
+      s.advance()
+    else:
+      result.append(scan_token(s))
+  return result
+
+def scan_token(s):
+  punctuation = {
+      "-": NOT,
+      ":": COLON,
+      "(": LPAREN,
+      ")": RPAREN,
+  }
+  c = s.peek()
+  if c in punctuation:
+    s.advance()
+    return punctuation[c]
+  if c == "\"":
+    return tokenize_string(s)
+  if c == "/":
+    return tokenize_regex(s)
+  if is_alpha(c):
+    return tokenize_identifier(s)
+
+def tokenize_string(s):
+  s.advance() # ignore opening 2x-quote
+  current = ""
+  while s.peek() != "\"" and not s.exhausted():
+    current += s.advance()
+  if s.exhausted():
+    raise Exception("Unterminated string")
+  s.advance() # ignore closing 2x-quote
+  return ("STRING", current)
+
+def tokenize_regex(s):
+  s.advance() # ignore opening forward-slash
+  current = ""
+  while s.peek() != "/" and not s.exhausted():
+    current += s.advance()
+  if s.exhausted():
+    raise Exception("Unterminated regex")
+  s.advance() # ignore closing forward-slash
+  return ("REGEX", current)
+
+def tokenize_identifier(s):
+  conjunctions = {
+      "AND",
+      "OR",
+  }
+  current = s.advance()
+  while is_alphanumeric(s.peek()):
+    current += s.advance()
+  if current.upper() in conjunctions:
+    return ("CONJUNCTION", current.upper())
+  else:
+    return ("IDENTIFIER", current)
+
+################################################################################
+# Parser
+################################################################################
+
+# EBNF
+# Note: we order expression types by ascending levels of precedence.
+#
+# expression  -> conjunction ;
+# conjunction -> selection ( ( "AND" | "OR" )? selection )* ;
+# selection   -> "-"? IDENTIFIER ":" ( REGEX | STRING ) | grouping ;
+# grouping    -> REGEX | STRING | "(" expression ")" ;
+
+def parse(x):
+  tokens = tokenize(x)
+  p = Parser(tokens)
+  return expression(p)
+
+def expression(p):
+  return conjunction(p)
+
+def conjunction(p):
+  lhs = selection(p)
+
+  # TODO(wpcarro): Support default AND conjuctions when they're undefined.
+  while not p.exhausted() and p.match({AND, OR}):
+    conj = p.peek(n=-1)
+    rhs = selection(p)
+    lhs = ("CONJUNCTION", conj[1], lhs, rhs)
+
+  return lhs
+
+def selection(p):
+  negate = False
+  if p.peek() == NOT:
+    negate = True
+    p.advance()
+
+  if p.peek()[0] != "IDENTIFIER":
+    return grouping(p)
+
+  ident = p.expect(lambda x: x[0] == "IDENTIFIER")
+  colon = p.expect(lambda x: x[1] == "COLON")
+  value = p.expect(lambda x: x[0] in {"REGEX", "STRING"})
+  return ("SELECTION", negate, ident[1], value)
+
+def grouping(p):
+  if p.peek()[0] == "REGEX":
+    return p.advance()
+
+  if p.peek()[0] == "STRING":
+    return p.advance()
+
+  if p.peek() == LPAREN:
+    p.advance()
+    expr = expression(p)
+    p.expect(lambda x: x == RPAREN)
+    return ("GROUPING", expr)
+
+################################################################################
+# Compiler
+################################################################################
+
+def compile(source, table, columns):
+  ast = parse(source)
+  return "SELECT * FROM {} WHERE {};".format(table, do_compile(ast, columns))
+
+def do_compile(ast, columns):
+  if ast[0] == "REGEX":
+    cols = "({})".format(" || ".join(columns))
+    return "{} REGEXP '.*{}.*'".format(cols, ast[1])
+
+  if ast[0] == "STRING":
+    cols = "({})".format(" || ".join(columns))
+    return "{} LIKE '%{}%'".format(cols, ast[1])
+
+  if ast[0] == "SELECTION":
+    return compile_selection(ast)
+
+  if ast[0] == "CONJUNCTION":
+    _, conj, lhs, rhs = ast
+    lhs = do_compile(lhs, columns)
+    rhs = do_compile(rhs, columns)
+    return "{} {} {}".format(lhs, conj, rhs)
+
+  if ast[0] == "GROUPING":
+    return "({})".format(do_compile(ast[1], columns))
+
+  raise Exception("Unexpected AST: \"{}\"".format(ast))
+
+def compile_selection(ast):
+  _, negate, column, query = ast
+  match = compile_query(negate, query)
+  return "{} {}".format(column, match)
+
+def compile_query(negate, query):
+  query_type, query_string = query
+  if query_type == "REGEX":
+    if negate:
+      return "NOT REGEXP '.*{}.*'".format(query_string)
+    return "REGEXP '.*{}.*'".format(query_string)
+
+  if query_type == "STRING":
+    if negate:
+      return "NOT LIKE '%{}%'".format(query_string)
+    return "LIKE '%{}%'".format(query_string)
+
+################################################################################
+# Helper Functions
+################################################################################
+
+def regexp(expr, x):
+  reg = re.compile(expr)
+  return reg.search(x) is not None
+
+################################################################################
+# Main
+################################################################################
+
+def main(csv_path=None, debug=False):
+  # Import CSV to SQLite
+  table = "main"
+  con = sqlite3.connect(":memory:")
+
+  con.create_function("REGEXP", 2, regexp)
+
+  cur = con.cursor()
+  with open(csv_path, "r") as f:
+    r = csv.DictReader(f)
+    columns = next(r).keys()
+
+    # TODO(wpcarro): Use safer interpolation variant of "?" here and throughout.
+    cur.execute("CREATE TABLE {} ({});".format(table, ",".join(columns)))
+    rows = [tuple(row[col] for col in columns) for row in r]
+    cur.executemany("INSERT INTO {} ({}) VALUES ({});".format(table, ",".join(columns), ",".join("?" for _ in columns)), rows)
+    con.commit()
+
+  while True:
+    x = input("> ")
+
+    if debug:
+      print("tokens:\t{}".format(tokenize(x)))
+      print("AST:\t{}".format(parse(x)))
+      print("query:\t\"{}\"".format(compile(x, table, columns)))
+
+    try:
+      compile(x, table, columns)
+      for row in cur.execute(compile(x, table, columns)):
+        print("\t".join(str(cell) for cell in row))
+    except:
+      print("Compilation error.")
+
+  # TODO(wpcarro): Trap exits and ensure cleanup always runs.
+  con.close()
+
+if __name__ == "__main__":
+  parser = ArgumentParser()
+  parser.add_argument("-f", "--file", dest="file", help="Path to the CSV from which to read", metavar="PATH")
+  parser.add_argument("-d", "--debug", dest="debug", default=False, action="store_true", help="Enable debugging")
+  args = parser.parse_args()
+  main(csv_path=args.file, debug=args.debug)
diff --git a/users/wpcarro/scratch/simple-select/parser.py b/users/wpcarro/scratch/simple-select/parser.py
new file mode 100644
index 000000000000..d26f970e57a2
--- /dev/null
+++ b/users/wpcarro/scratch/simple-select/parser.py
@@ -0,0 +1,31 @@
+class Parser(object):
+    def __init__(self, tokens):
+        self.tokens = tokens
+        self.i = 0
+
+    def exhausted(self):
+        return self.i >= len(self.tokens)
+
+    def peek(self, n=0):
+        return self.tokens[self.i + n]
+
+    def advance(self):
+        if not self.exhausted():
+            self.i += 1
+        return self.peek(n=-1)
+
+    def match(self, xs):
+        if self.peek() in xs:
+            self.advance()
+            return True
+        return False
+
+    def test(self, predicate):
+        return predicate(self.tokens, self.i)
+
+    def expect(self, predicate):
+        if self.exhausted():
+            raise Exception("Unexpected EOL")
+        if predicate(self.peek()):
+            return self.advance()
+        raise Exception("Unexpected token: \"{}\"".format(self.peek()))
diff --git a/users/wpcarro/scratch/simple-select/scanner.py b/users/wpcarro/scratch/simple-select/scanner.py
new file mode 100644
index 000000000000..5dae68aee551
--- /dev/null
+++ b/users/wpcarro/scratch/simple-select/scanner.py
@@ -0,0 +1,27 @@
+# According to Crafting Interpreters, the only two primitives that a
+# scanner/lexer needs are peek and advance; other functions (e.g. match) are
+# nice-to-haves.
+class Scanner(object):
+  def __init__(self, chars):
+    self.i = 0
+    self.chars = chars
+
+  def exhausted(self):
+    return self.i >= len(self.chars)
+
+  def peek(self, n=0):
+    return self.chars[self.i + n] if self.i in range(0, len(self.chars)) else '\0'
+
+  def advance(self):
+    result = self.peek()
+    self.i += 1
+    return result
+
+  def match(self, x):
+    if self.exhausted():
+      return False
+    if self.peek() == x:
+      self.advance()
+      return True
+    else:
+      return False