diff options
Diffstat (limited to 'users/wpcarro/scratch/facebook/parsing/regex.py')
-rw-r--r-- | users/wpcarro/scratch/facebook/parsing/regex.py | 184 |
1 files changed, 184 insertions, 0 deletions
diff --git a/users/wpcarro/scratch/facebook/parsing/regex.py b/users/wpcarro/scratch/facebook/parsing/regex.py new file mode 100644 index 000000000000..7fc2ef34e2ff --- /dev/null +++ b/users/wpcarro/scratch/facebook/parsing/regex.py @@ -0,0 +1,184 @@ +# Writing a small proof-of-concept... +# - lexer +# - parser +# - compiler +# ...for regex. +# +# BNF +# expression -> ( char_class | CHAR ) quantifier? ( "|" expression )* +# char_class -> "[" CHAR+ "]" +# quantifier -> "?" | "*" | "+" | "{" INT? "," INT? "}" +# +# Of the numerous things I do not support, here are a few items of which I'm +# aware: +# - alternatives: (a|b) +# - capture groups: (ab)cd + +from parser import Parser +import string + +################################################################################ +# Top-Level API +################################################################################ + +def tokenize(xs): + """ + Transform `xs` into a list of tokens. + + Also: expand shorthand symbols using the following table: + - ? -> {0,1} + - * -> {0,} + - + -> {1,} + """ + result = [] + i = 0 + shorthand = { + "?": ["{", 0, ",", 1, "}"], + "*": ["{", 0, ",", "}"], + "+": ["{", 1, ",", "}"], + } + while i < len(xs): + if xs[i] in shorthand: + for c in shorthand[xs[i]]: + result.append(c) + i += 1 + elif xs[i] == "{": + result.append(xs[i]) + i += 1 + curr = "" + while xs[i] in string.digits: + curr += xs[i] + i += 1 + result.append(int(curr)) + assert xs[i] == "," + result.append(",") + i += 1 + curr = "" + while xs[i] in string.digits: + curr += xs[i] + i += 1 + result.append(int(curr)) + else: + result.append(xs[i]) + i += 1 + return result + +def parse(expr): + """ + Tokenize `expr` and convert it into a parse-tree. + """ + tokens = tokenize(expr) + return parse_tokens(tokens) + +def compile(xs): + """ + Transform `xs`, a parse-tree representing a regex, into a function that + accepts a string, and returns the substring that the regex matches. + """ + def fn(input): + match = "" + i = 0 + for x in xs: + matches, q = x[1], x[2] + lo, hi = q[1], q[2] + for j in range(lo): + if i < len(input) and input[i] in matches: + match += input[i] + i += 1 + else: + print("Failed to match {} with {}".format(input[i], matches)) + return None + if hi == float('inf'): + while i < len(input) and input[i] in matches: + match += input[i] + i += 1 + else: + for j in range(hi - lo): + if i < len(input) and input[i] in matches: + match += input[i] + i += 1 + return match + return fn + +################################################################################ +# Helper Functions +################################################################################ + +def parse_tokens(tokens): + result = [] + parser = Parser(tokens) + while not parser.exhausted(): + result.append(parse_expression(parser)) + return result + +def parse_expression(parser): + if parser.curr() == "[": + return parse_character_class(parser) + else: + return parse_character(parser) + +def parse_character_class(parser): + parser.expect("[") + beg = parser.consume() + parser.expect("-") + end = parser.consume() + parser.expect("]") + if parser.curr() == "{": + q = parse_quantifier(parser) + return char_class(xs=expand_range(beg, end), q=q) + +def parse_quantifier(parser): + parser.expect("{") + if parser.match([","]): + end = parser.consume() + parser.expect("}") + return quantifier(beg=0, end=end) + else: + beg = parser.consume() + parser.expect(",") + if parser.match(["}"]): + return quantifier(beg=beg) + else: + end = parser.consume() + parser.expect("}") + return quantifier(beg=beg, end=end) + +def parse_character(parser): + c = parser.consume() + q = None + if parser.curr() == "{": + q = parse_quantifier(parser) + return char_class(xs={c}, q=q) + +def char_class(xs=set(), q=None): + if not q: + q = quantifier(beg=1, end=1) + return ["CHARACTER_CLASS", xs, q] + +def expand_range(beg, end): + # TODO: Implement this + return {string.printable[i] + for i in range(string.printable.index(beg), + string.printable.index(end) + 1)} + +def quantifier(beg=0, end=float('inf')): + return ['QUANTIFIER', beg, end] + +################################################################################ +# Tests +################################################################################ + +xs = [ + ("[a-c]*[0-9]{2,3}", ["dog"]), + ("ca+t?", ["cat", "caaaat", "ca", "dog"]), +] + +for re, inputs in xs: + print("Regex: {}".format(re)) + print("Tokens: {}".format(tokenize(re))) + print("Parsed: {}".format(parse(re))) + print("\nTESTS") + for input in inputs: + print("Attempting to match \"{}\"...".format(input)) + parser = compile(parse(re)) + print("Result: \"{}\"\n".format(parser(input))) |