Diffstat (limited to 'users/wpcarro/scratch/facebook/parsing')
-rw-r--r--  users/wpcarro/scratch/facebook/parsing/json.py   | 122
-rw-r--r--  users/wpcarro/scratch/facebook/parsing/parser.py |  28
-rw-r--r--  users/wpcarro/scratch/facebook/parsing/regex.py  | 187
3 files changed, 337 insertions, 0 deletions
diff --git a/users/wpcarro/scratch/facebook/parsing/json.py b/users/wpcarro/scratch/facebook/parsing/json.py
new file mode 100644
index 000000000000..3975e973fefa
--- /dev/null
+++ b/users/wpcarro/scratch/facebook/parsing/json.py
@@ -0,0 +1,122 @@
+from parser import Parser
+
+# As an exercise to stress-test my understanding of recursive descent parsers,
+# I'm attempting to write a JSON parser without referencing any existing BNF
+# descriptions of JSON or existing JSON parser implementations.
+#
+# I'm only parsing a subset of JSON: enough to parse `sample`. Here is the BNF
+# that I wrote to describe my expected input:
+#
+# expression -> object
+# object     -> '{' ( STRING ':' expression ) ( ',' STRING ':' expression )* '}'
+#             | array
+# array      -> '[' expression ( ',' expression )* ']'
+#             | literal
+# literal    -> STRING | INT
+
+def tokenize(xs):
+    """
+    Return a list of tokens from the string input, `xs`.
+    """
+    result = []
+    i = 0
+    while i < len(xs):
+        # single characters
+        if xs[i] in ",{}:[]":
+            result.append(xs[i])
+            i += 1
+        # strings
+        elif xs[i] == "\"":
+            curr = xs[i]
+            i += 1
+            while xs[i] != "\"":
+                curr += xs[i]
+                i += 1
+            curr += xs[i]
+            result.append(curr)
+            i += 1
+        # integers
+        elif xs[i] in "0123456789":
+            curr = xs[i]
+            i += 1
+            while xs[i] in "0123456789":
+                curr += xs[i]
+                i += 1
+            result.append(int(curr))
+        # whitespace
+        elif xs[i] in {" ", "\n"}:
+            i += 1
+    return result
+
+def parse_json(x):
+    """
+    Attempt to parse the string, `x`, into JSON.
+    """
+    tokens = tokenize(x)
+    return parse_object(Parser(tokens))
+
+def parse_object(parser):
+    if parser.match(['{']):
+        key = parse_string(parser)
+        parser.expect([':'])
+        value = parse_object(parser)
+        result = [(key, value)]
+        while parser.match([',']):
+            key = parse_string(parser)
+            parser.expect([':'])
+            value = parse_object(parser)
+            result.append((key, value))
+        parser.expect(['}'])
+        return result
+    return parse_array(parser)
+
+def parse_array(parser):
+    if parser.match(['[']):
+        if parser.match([']']):
+            return []
+        result = [parse_object(parser)]
+        while parser.match([',']):
+            result.append(parse_object(parser))
+        parser.expect([']'])
+        return result
+    else:
+        return parse_literal(parser)
+
+def parse_string(parser):
+    if parser.curr().startswith("\""):
+        return parser.consume()
+    else:
+        raise Exception("Unexpected token: {}".format(parser.curr()))
+
+def parse_literal(parser):
+    return parser.consume()
+
+sample = """
+{
+  "glossary": {
+    "title": "example glossary",
+    "GlossDiv": {
+      "title": "S",
+      "GlossList": {
+        "GlossEntry": {
+          "ID": "SGML",
+          "SortAs": "SGML",
+          "GlossTerm": "Standard Generalized Markup Language",
+          "Acronym": "SGML",
+          "Abbrev": "ISO 8879:1986",
+          "GlossDef": {
+            "para": "A meta-markup language, used to create markup languages such as DocBook.",
+            "GlossSeeAlso": [
+              "GML",
+              "XML"
+            ]
+          },
+          "GlossSee": "markup"
+        }
+      }
+    }
+  }
+}
+"""
+
+print(parse_json(sample))
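Note for readers of the diff above: `parse_json` returns JSON objects as lists of (key, value) pairs, with string tokens still carrying their surrounding quotes, rather than as native dicts. A minimal post-processing sketch under that assumption; the `to_native` helper below is hypothetical and not part of json.py:

    # Hypothetical helper (not in json.py): collapse the pair-list
    # representation produced by parse_json into native Python values.
    def to_native(x):
        # Objects come back as non-empty lists of (key, value) tuples.
        if isinstance(x, list) and x and all(isinstance(p, tuple) and len(p) == 2 for p in x):
            return {k.strip('"'): to_native(v) for k, v in x}
        # Arrays come back as plain lists of anything else.
        if isinstance(x, list):
            return [to_native(v) for v in x]
        # String tokens keep their quotes; ints are already ints.
        if isinstance(x, str) and x.startswith('"'):
            return x.strip('"')
        return x

    print(to_native([("\"a\"", [("\"b\"", 1)]), ("\"c\"", 1)]))
    # => {'a': {'b': 1}, 'c': 1}
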
diff --git a/users/wpcarro/scratch/facebook/parsing/parser.py b/users/wpcarro/scratch/facebook/parsing/parser.py
new file mode 100644
index 000000000000..407bff61c980
--- /dev/null
+++ b/users/wpcarro/scratch/facebook/parsing/parser.py
@@ -0,0 +1,28 @@
+class Parser(object):
+    def __init__(self, tokens):
+        self.tokens = tokens
+        self.i = 0
+
+    def prev(self):
+        return self.tokens[self.i - 1]
+
+    def curr(self):
+        return self.tokens[self.i]
+
+    def consume(self):
+        if not self.exhausted():
+            self.i += 1
+            return self.prev()
+
+    def match(self, xs):
+        if not self.exhausted() and self.curr() in xs:
+            self.consume()
+            return True
+        return False
+
+    def expect(self, xs):
+        if not self.match(xs):
+            raise Exception("Expected token \"{}\" but received \"{}\"".format(xs, self.curr()))
+
+    def exhausted(self):
+        return self.i >= len(self.tokens)
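The Parser class above is a generic token-stream cursor shared by both parsers in this commit: `match` consumes the current token only when it is in the expected set, while `expect` raises on a mismatch. A small sketch of driving it by hand, assuming parser.py sits in the working directory; `parse_int_list` is a hypothetical name, not part of the commit:

    from parser import Parser  # the 28-line class above

    def parse_int_list(parser):
        parser.expect(['['])                 # raises unless the next token is '['
        if parser.match([']']):              # match consumes on success...
            return []
        result = [parser.consume()]
        while parser.match([',']):           # ...and is a no-op on failure
            result.append(parser.consume())
        parser.expect([']'])
        return result

    print(parse_int_list(Parser(['[', 1, ',', 2, ']'])))  # => [1, 2]
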
diff --git a/users/wpcarro/scratch/facebook/parsing/regex.py b/users/wpcarro/scratch/facebook/parsing/regex.py
new file mode 100644
index 000000000000..7fc2ef34e2ff
--- /dev/null
+++ b/users/wpcarro/scratch/facebook/parsing/regex.py
@@ -0,0 +1,187 @@
+# Writing a small proof-of-concept...
+# - lexer
+# - parser
+# - compiler
+# ...for regex.
+#
+# BNF
+# expression -> ( char_class | CHAR ) quantifier? ( "|" expression )*
+# char_class -> "[" CHAR+ "]"
+# quantifier -> "?" | "*" | "+" | "{" INT? "," INT? "}"
+#
+# Of the numerous things I do not support, here are a few items of which I'm
+# aware:
+# - alternatives: (a|b)
+# - capture groups: (ab)cd
+
+from parser import Parser
+import string
+
+################################################################################
+# Top-Level API
+################################################################################
+
+def tokenize(xs):
+    """
+    Transform `xs` into a list of tokens.
+
+    Also: expand shorthand symbols using the following table:
+    - ? -> {0,1}
+    - * -> {0,}
+    - + -> {1,}
+    """
+    result = []
+    i = 0
+    shorthand = {
+        "?": ["{", 0, ",", 1, "}"],
+        "*": ["{", 0, ",", "}"],
+        "+": ["{", 1, ",", "}"],
+    }
+    while i < len(xs):
+        if xs[i] in shorthand:
+            for c in shorthand[xs[i]]:
+                result.append(c)
+            i += 1
+        elif xs[i] == "{":
+            result.append(xs[i])
+            i += 1
+            curr = ""
+            while xs[i] in string.digits:
+                curr += xs[i]
+                i += 1
+            if curr:
+                result.append(int(curr))
+            assert xs[i] == ","
+            result.append(",")
+            i += 1
+            curr = ""
+            while xs[i] in string.digits:
+                curr += xs[i]
+                i += 1
+            if curr:
+                result.append(int(curr))
+        else:
+            result.append(xs[i])
+            i += 1
+    return result
+
+def parse(expr):
+    """
+    Tokenize `expr` and convert it into a parse-tree.
+    """
+    tokens = tokenize(expr)
+    return parse_tokens(tokens)
+
+def compile(xs):
+    """
+    Transform `xs`, a parse-tree representing a regex, into a function that
+    accepts a string, and returns the substring that the regex matches.
+    """
+    def fn(input):
+        match = ""
+        i = 0
+        for x in xs:
+            matches, q = x[1], x[2]
+            lo, hi = q[1], q[2]
+            for j in range(lo):
+                if i < len(input) and input[i] in matches:
+                    match += input[i]
+                    i += 1
+                else:
+                    print("Failed to match {} with {}".format(input[i] if i < len(input) else "<end>", matches))
+                    return None
+            if hi == float('inf'):
+                while i < len(input) and input[i] in matches:
+                    match += input[i]
+                    i += 1
+            else:
+                for j in range(hi - lo):
+                    if i < len(input) and input[i] in matches:
+                        match += input[i]
+                        i += 1
+        return match
+    return fn
+
+################################################################################
+# Helper Functions
+################################################################################
+
+def parse_tokens(tokens):
+    result = []
+    parser = Parser(tokens)
+    while not parser.exhausted():
+        result.append(parse_expression(parser))
+    return result
+
+def parse_expression(parser):
+    if parser.curr() == "[":
+        return parse_character_class(parser)
+    else:
+        return parse_character(parser)
+
+def parse_character_class(parser):
+    parser.expect("[")
+    beg = parser.consume()
+    parser.expect("-")
+    end = parser.consume()
+    parser.expect("]")
+    q = None
+    if not parser.exhausted() and parser.curr() == "{":
+        q = parse_quantifier(parser)
+    return char_class(xs=expand_range(beg, end), q=q)
+
+def parse_quantifier(parser):
+    parser.expect("{")
+    if parser.match([","]):
+        end = parser.consume()
+        parser.expect("}")
+        return quantifier(beg=0, end=end)
+    else:
+        beg = parser.consume()
+        parser.expect(",")
+        if parser.match(["}"]):
+            return quantifier(beg=beg)
+        else:
+            end = parser.consume()
+            parser.expect("}")
+            return quantifier(beg=beg, end=end)
+
+def parse_character(parser):
+    c = parser.consume()
+    q = None
+    if not parser.exhausted() and parser.curr() == "{":
+        q = parse_quantifier(parser)
+    return char_class(xs={c}, q=q)
+
+def char_class(xs=set(), q=None):
+    if not q:
+        q = quantifier(beg=1, end=1)
+    return ["CHARACTER_CLASS", xs, q]
+
+def expand_range(beg, end):
+    # NOTE: this only supports characters that appear in string.printable.
+    return {string.printable[i]
+            for i in range(string.printable.index(beg),
+                           string.printable.index(end) + 1)}
+
+def quantifier(beg=0, end=float('inf')):
+    return ['QUANTIFIER', beg, end]
+
+################################################################################
+# Tests
+################################################################################
+
+xs = [
+    ("[a-c]*[0-9]{2,3}", ["dog"]),
+    ("ca+t?", ["cat", "caaaat", "ca", "dog"]),
+]
+
+for re, inputs in xs:
+    print("Regex: {}".format(re))
+    print("Tokens: {}".format(tokenize(re)))
+    print("Parsed: {}".format(parse(re)))
+    print("\nTESTS")
+    for input in inputs:
+        print("Attempting to match \"{}\"...".format(input))
+        parser = compile(parse(re))
+        print("Result: \"{}\"\n".format(parser(input)))
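To make the pipeline above concrete: `tokenize` expands shorthand quantifiers into {lo,hi} token runs, `parse` builds ['CHARACTER_CLASS', set, ['QUANTIFIER', lo, hi]] nodes, and `compile` folds those into a greedy prefix matcher. A small usage sketch, assuming regex.py sits in the working directory (importing it also runs its demo loop at the bottom, so expect some extra output):

    from regex import parse, compile  # the scratch module above, not a PyPI package

    print(parse("a+"))
    # => [['CHARACTER_CLASS', {'a'}, ['QUANTIFIER', 1, inf]]]

    matcher = compile(parse("a+"))
    print(matcher("aaab"))  # => "aaa": greedily matches the longest prefix
    print(matcher("b"))     # => None: the required first 'a' is missing
                            #    (a failure diagnostic is also printed)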