about summary refs log tree commit diff
path: root/users/wpcarro/scratch/facebook/parsing
diff options
context:
space:
mode:
Diffstat (limited to 'users/wpcarro/scratch/facebook/parsing')
-rw-r--r--users/wpcarro/scratch/facebook/parsing/json.py121
-rw-r--r--users/wpcarro/scratch/facebook/parsing/parser.py28
-rw-r--r--users/wpcarro/scratch/facebook/parsing/regex.py184
3 files changed, 333 insertions, 0 deletions
diff --git a/users/wpcarro/scratch/facebook/parsing/json.py b/users/wpcarro/scratch/facebook/parsing/json.py
new file mode 100644
index 0000000000..3975e973fe
--- /dev/null
+++ b/users/wpcarro/scratch/facebook/parsing/json.py
@@ -0,0 +1,121 @@
+from parser import Parser
+
+# As an exercise to stress-test my understanding of recursive descent parsers,
+# I'm attempting to write a JSON parser without referencing any existing BNF
+# descriptions of JSON or existing JSON parser implementations.
+#
+# I'm only parsing a subset of JSON: enough to parse `sample`. Here is the BNF
+# that I wrote to describe my expected input:
+#
+# expression -> object
+# object     -> '{' ( STRING ':' expression ) ( ',' STRING ':' expression )* '}'
+#            |  array
+# array      -> '[' expression ( ',' expression )* ']'
+#            |  literal
+# literal    -> STRING | INT
+
+def tokenize(xs):
+    """
+    Return a list of tokens from the string input, `xs`.
+    """
+    result = []
+    i = 0
+    while i < len(xs):
+        # single characters
+        if xs[i] in ",{}:[]":
+            result.append(xs[i])
+            i += 1
+        # strings
+        elif xs[i] == "\"":
+            curr = xs[i]
+            i += 1
+            while xs[i] != "\"":
+                curr += xs[i]
+                i += 1
+            curr += xs[i]
+            result.append(curr)
+            i += 1
+        # integers
+        elif xs[i] in "0123456789":
+            curr = xs[i]
+            i += 1
+            while xs[i] in "0123456789":
+                curr += xs[i]
+                i += 1
+            result.append(int(curr))
+        # whitespace
+        elif xs[i] in {" ", "\n"}:
+            i += 1
+    return result
+
+def parse_json(x):
+    """
+    Attempt to parse the string, `x`, into JSON.
+    """
+    tokens = tokenize(x)
+    return parse_object(Parser(tokens))
+
+def parse_object(parser):
+    if parser.match(['{']):
+        key = parse_string(parser)
+        parser.expect([':'])
+        value = parse_object(parser)
+        result = [(key, value)]
+        while parser.match([',']):
+            key = parse_string(parser)
+            parser.match([':'])
+            value = parse_object(parser)
+            result.append((key, value))
+        return result
+    return parse_array(parser)
+
+def parse_array(parser):
+    if parser.match(['[']):
+        if parser.match([']']):
+            return []
+        result = [parse_object(parser)]
+        while parser.match([',']):
+            result.append(parse_object(parser))
+        parser.expect([']'])
+        return result
+    else:
+        return parse_literal(parser)
+
+def parse_string(parser):
+    if parser.curr().startswith("\""):
+        return parser.consume()
+    else:
+        raise Exception("Unexpected token: {}".format(parser.curr()))
+
+def parse_literal(parser):
+    return parser.consume()
+
+sample = """
+{
+  "glossary": {
+    "title": "example glossary",
+    "GlossDiv": {
+      "title": "S",
+      "GlossList": {
+        "GlossEntry": {
+          "ID": "SGML",
+          "SortAs": "SGML",
+          "GlossTerm": "Standard Generalized Markup Language",
+          "Acronym": "SGML",
+          "Abbrev": "ISO 8879:1986",
+          "GlossDef": {
+            "para": "A meta-markup language, used to create markup languages such as DocBook.",
+            "GlossSeeAlso": [
+              "GML",
+              "XML"
+            ]
+          },
+          "GlossSee": "markup"
+        }
+      }
+    }
+  }
+}
+"""
+
+print(parse_json(sample))
diff --git a/users/wpcarro/scratch/facebook/parsing/parser.py b/users/wpcarro/scratch/facebook/parsing/parser.py
new file mode 100644
index 0000000000..407bff61c9
--- /dev/null
+++ b/users/wpcarro/scratch/facebook/parsing/parser.py
@@ -0,0 +1,28 @@
+class Parser(object):
+    def __init__(self, tokens):
+        self.tokens = tokens
+        self.i = 0
+
+    def prev(self):
+        return self.tokens[self.i - 1]
+
+    def curr(self):
+        return self.tokens[self.i]
+
+    def consume(self):
+        if not self.exhausted():
+            self.i += 1
+            return self.prev()
+
+    def match(self, xs):
+        if not self.exhausted() and self.curr() in xs:
+            self.consume()
+            return True
+        return False
+
+    def expect(self, xs):
+        if not self.match(xs):
+            raise Exception("Expected token \"{}\" but received \"{}\"".format(xs, self.curr()))
+
+    def exhausted(self):
+        return self.i >= len(self.tokens)
diff --git a/users/wpcarro/scratch/facebook/parsing/regex.py b/users/wpcarro/scratch/facebook/parsing/regex.py
new file mode 100644
index 0000000000..7fc2ef34e2
--- /dev/null
+++ b/users/wpcarro/scratch/facebook/parsing/regex.py
@@ -0,0 +1,184 @@
+# Writing a small proof-of-concept...
+#   - lexer
+#   - parser
+#   - compiler
+# ...for regex.
+#
+# BNF
+# expression -> ( char_class | CHAR ) quantifier? ( "|" expression )*
+# char_class -> "[" CHAR+ "]"
+# quantifier -> "?" | "*" | "+" | "{" INT? "," INT? "}"
+#
+# Of the numerous things I do not support, here are a few items of which I'm
+# aware:
+#   - alternatives:   (a|b)
+#   - capture groups: (ab)cd
+
+from parser import Parser
+import string
+
+################################################################################
+# Top-Level API
+################################################################################
+
+def tokenize(xs):
+    """
+    Transform `xs` into a list of tokens.
+
+    Also: expand shorthand symbols using the following table:
+      - ? -> {0,1}
+      - * -> {0,}
+      - + -> {1,}
+    """
+    result = []
+    i = 0
+    shorthand = {
+        "?": ["{", 0, ",", 1, "}"],
+        "*": ["{", 0, ",", "}"],
+        "+": ["{", 1, ",", "}"],
+    }
+    while i < len(xs):
+        if xs[i] in shorthand:
+            for c in shorthand[xs[i]]:
+                result.append(c)
+            i += 1
+        elif xs[i] == "{":
+            result.append(xs[i])
+            i += 1
+            curr = ""
+            while xs[i] in string.digits:
+                curr += xs[i]
+                i += 1
+            result.append(int(curr))
+            assert xs[i] == ","
+            result.append(",")
+            i += 1
+            curr = ""
+            while xs[i] in string.digits:
+                curr += xs[i]
+                i += 1
+            result.append(int(curr))
+        else:
+            result.append(xs[i])
+            i += 1
+    return result
+
+def parse(expr):
+    """
+    Tokenize `expr` and convert it into a parse-tree.
+    """
+    tokens = tokenize(expr)
+    return parse_tokens(tokens)
+
+def compile(xs):
+    """
+    Transform `xs`, a parse-tree representing a regex, into a function that
+    accepts a string, and returns the substring that the regex matches.
+    """
+    def fn(input):
+        match = ""
+        i = 0
+        for x in xs:
+            matches, q = x[1], x[2]
+            lo, hi = q[1], q[2]
+            for j in range(lo):
+                if i < len(input) and input[i] in matches:
+                    match += input[i]
+                    i += 1
+                else:
+                    print("Failed to match {} with {}".format(input[i], matches))
+                    return None
+            if hi == float('inf'):
+                while i < len(input) and input[i] in matches:
+                    match += input[i]
+                    i += 1
+            else:
+                for j in range(hi - lo):
+                    if i < len(input) and input[i] in matches:
+                        match += input[i]
+                        i += 1
+        return match
+    return fn
+
+################################################################################
+# Helper Functions
+################################################################################
+
+def parse_tokens(tokens):
+    result = []
+    parser = Parser(tokens)
+    while not parser.exhausted():
+        result.append(parse_expression(parser))
+    return result
+
+def parse_expression(parser):
+    if parser.curr() == "[":
+        return parse_character_class(parser)
+    else:
+        return parse_character(parser)
+
+def parse_character_class(parser):
+    parser.expect("[")
+    beg = parser.consume()
+    parser.expect("-")
+    end = parser.consume()
+    parser.expect("]")
+    if parser.curr() == "{":
+        q = parse_quantifier(parser)
+    return char_class(xs=expand_range(beg, end), q=q)
+
+def parse_quantifier(parser):
+    parser.expect("{")
+    if parser.match([","]):
+        end = parser.consume()
+        parser.expect("}")
+        return quantifier(beg=0, end=end)
+    else:
+        beg = parser.consume()
+        parser.expect(",")
+        if parser.match(["}"]):
+            return quantifier(beg=beg)
+        else:
+            end = parser.consume()
+            parser.expect("}")
+            return quantifier(beg=beg, end=end)
+
+def parse_character(parser):
+    c = parser.consume()
+    q = None
+    if parser.curr() == "{":
+        q = parse_quantifier(parser)
+    return char_class(xs={c}, q=q)
+
+def char_class(xs=set(), q=None):
+    if not q:
+        q = quantifier(beg=1, end=1)
+    return ["CHARACTER_CLASS", xs, q]
+
+def expand_range(beg, end):
+    # TODO: Implement this
+    return {string.printable[i]
+            for i in range(string.printable.index(beg),
+                           string.printable.index(end) + 1)}
+
+def quantifier(beg=0, end=float('inf')):
+    return ['QUANTIFIER', beg, end]
+
+################################################################################
+# Tests
+################################################################################
+
+xs = [
+    ("[a-c]*[0-9]{2,3}", ["dog"]),
+    ("ca+t?", ["cat", "caaaat", "ca", "dog"]),
+]
+
+for re, inputs in xs:
+    print("Regex:  {}".format(re))
+    print("Tokens: {}".format(tokenize(re)))
+    print("Parsed: {}".format(parse(re)))
+    print("\nTESTS")
+    for input in inputs:
+        print("Attempting to match \"{}\"...".format(input))
+        parser = compile(parse(re))
+        print("Result: \"{}\"\n".format(parser(input)))