about summary refs log tree commit diff
path: root/users/wpcarro/scratch/facebook/parsing/regex.py
diff options
context:
space:
mode:
Diffstat (limited to 'users/wpcarro/scratch/facebook/parsing/regex.py')
-rw-r--r--users/wpcarro/scratch/facebook/parsing/regex.py184
1 files changed, 184 insertions, 0 deletions
diff --git a/users/wpcarro/scratch/facebook/parsing/regex.py b/users/wpcarro/scratch/facebook/parsing/regex.py
new file mode 100644
index 0000000000..7fc2ef34e2
--- /dev/null
+++ b/users/wpcarro/scratch/facebook/parsing/regex.py
@@ -0,0 +1,184 @@
+# Writing a small proof-of-concept...
+#   - lexer
+#   - parser
+#   - compiler
+# ...for regex.
+#
+# BNF
+# expression -> ( char_class | CHAR ) quantifier? ( "|" expression )*
+# char_class -> "[" CHAR+ "]"
+# quantifier -> "?" | "*" | "+" | "{" INT? "," INT? "}"
+#
+# Of the numerous things I do not support, here are a few items of which I'm
+# aware:
+#   - alternatives:   (a|b)
+#   - capture groups: (ab)cd
+
+from parser import Parser
+import string
+
+################################################################################
+# Top-Level API
+################################################################################
+
+def tokenize(xs):
+    """
+    Transform `xs` into a list of tokens.
+
+    Also: expand shorthand symbols using the following table:
+      - ? -> {0,1}
+      - * -> {0,}
+      - + -> {1,}
+    """
+    result = []
+    i = 0
+    shorthand = {
+        "?": ["{", 0, ",", 1, "}"],
+        "*": ["{", 0, ",", "}"],
+        "+": ["{", 1, ",", "}"],
+    }
+    while i < len(xs):
+        if xs[i] in shorthand:
+            for c in shorthand[xs[i]]:
+                result.append(c)
+            i += 1
+        elif xs[i] == "{":
+            result.append(xs[i])
+            i += 1
+            curr = ""
+            while xs[i] in string.digits:
+                curr += xs[i]
+                i += 1
+            result.append(int(curr))
+            assert xs[i] == ","
+            result.append(",")
+            i += 1
+            curr = ""
+            while xs[i] in string.digits:
+                curr += xs[i]
+                i += 1
+            result.append(int(curr))
+        else:
+            result.append(xs[i])
+            i += 1
+    return result
+
+def parse(expr):
+    """
+    Tokenize `expr` and convert it into a parse-tree.
+    """
+    tokens = tokenize(expr)
+    return parse_tokens(tokens)
+
+def compile(xs):
+    """
+    Transform `xs`, a parse-tree representing a regex, into a function that
+    accepts a string, and returns the substring that the regex matches.
+    """
+    def fn(input):
+        match = ""
+        i = 0
+        for x in xs:
+            matches, q = x[1], x[2]
+            lo, hi = q[1], q[2]
+            for j in range(lo):
+                if i < len(input) and input[i] in matches:
+                    match += input[i]
+                    i += 1
+                else:
+                    print("Failed to match {} with {}".format(input[i], matches))
+                    return None
+            if hi == float('inf'):
+                while i < len(input) and input[i] in matches:
+                    match += input[i]
+                    i += 1
+            else:
+                for j in range(hi - lo):
+                    if i < len(input) and input[i] in matches:
+                        match += input[i]
+                        i += 1
+        return match
+    return fn
+
+################################################################################
+# Helper Functions
+################################################################################
+
+def parse_tokens(tokens):
+    result = []
+    parser = Parser(tokens)
+    while not parser.exhausted():
+        result.append(parse_expression(parser))
+    return result
+
+def parse_expression(parser):
+    if parser.curr() == "[":
+        return parse_character_class(parser)
+    else:
+        return parse_character(parser)
+
+def parse_character_class(parser):
+    parser.expect("[")
+    beg = parser.consume()
+    parser.expect("-")
+    end = parser.consume()
+    parser.expect("]")
+    if parser.curr() == "{":
+        q = parse_quantifier(parser)
+    return char_class(xs=expand_range(beg, end), q=q)
+
+def parse_quantifier(parser):
+    parser.expect("{")
+    if parser.match([","]):
+        end = parser.consume()
+        parser.expect("}")
+        return quantifier(beg=0, end=end)
+    else:
+        beg = parser.consume()
+        parser.expect(",")
+        if parser.match(["}"]):
+            return quantifier(beg=beg)
+        else:
+            end = parser.consume()
+            parser.expect("}")
+            return quantifier(beg=beg, end=end)
+
+def parse_character(parser):
+    c = parser.consume()
+    q = None
+    if parser.curr() == "{":
+        q = parse_quantifier(parser)
+    return char_class(xs={c}, q=q)
+
+def char_class(xs=set(), q=None):
+    if not q:
+        q = quantifier(beg=1, end=1)
+    return ["CHARACTER_CLASS", xs, q]
+
+def expand_range(beg, end):
+    # TODO: Implement this
+    return {string.printable[i]
+            for i in range(string.printable.index(beg),
+                           string.printable.index(end) + 1)}
+
+def quantifier(beg=0, end=float('inf')):
+    return ['QUANTIFIER', beg, end]
+
+################################################################################
+# Tests
+################################################################################
+
+xs = [
+    ("[a-c]*[0-9]{2,3}", ["dog"]),
+    ("ca+t?", ["cat", "caaaat", "ca", "dog"]),
+]
+
+for re, inputs in xs:
+    print("Regex:  {}".format(re))
+    print("Tokens: {}".format(tokenize(re)))
+    print("Parsed: {}".format(parse(re)))
+    print("\nTESTS")
+    for input in inputs:
+        print("Attempting to match \"{}\"...".format(input))
+        parser = compile(parse(re))
+        print("Result: \"{}\"\n".format(parser(input)))