diff options
author | William Carroll <wpcarro@gmail.com> | 2020-11-14T13·58+0000 |
---|---|---|
committer | William Carroll <wpcarro@gmail.com> | 2020-11-14T14·00+0000 |
commit | c841527f616ffb4a6cd64f941421cf42f54b7d00 (patch) | |
tree | 09264aa849444fa1e648691464e7890427211afb /scratch | |
parent | bfd2180e6bb0cdbcc3214096854416265b2ad8f4 (diff) |
Write encoded XML parser and pretty-printer
Write a function that reads a string of compressed XML and outputs the decompressed version. Note to self: Now that I'm growing more comfortable writing parsers, I'd like to become equally comfortable writing pretty-printers.
Diffstat (limited to 'scratch')
-rw-r--r-- | scratch/facebook/moderate/decompress-xml.py | 98 | ||||
-rw-r--r-- | scratch/facebook/moderate/parser.py | 37 |
2 files changed, 135 insertions, 0 deletions
diff --git a/scratch/facebook/moderate/decompress-xml.py b/scratch/facebook/moderate/decompress-xml.py new file mode 100644 index 000000000000..b22983ed7aff --- /dev/null +++ b/scratch/facebook/moderate/decompress-xml.py @@ -0,0 +1,98 @@ +import string +from parser import Parser + +mapping = { + 1: "family", + 2: "person", + 3: "firstName", + 4: "lastName", + 5: "state", +} + +def parse_int(i, xs): + result = "" + while i < len(xs) and xs[i] in string.digits: + result += xs[i] + i += 1 + return i, int(result) + +def parse_string(i, xs): + result = "" + while xs[i+1] not in string.digits: + result += xs[i] + i += 1 + return i, result + +def tokenize(xs): + result = [] + i = 0 + while i < len(xs): + if xs[i] in string.digits: + i, n = parse_int(i, xs) + result.append(n) + elif xs[i] in string.ascii_letters: + i, x = parse_string(i, xs) + result.append(x) + elif xs[i] == " ": + i += 1 + continue + return result + +def parse(xs): + parser = Parser(tokenize(xs)) + return parse_element(parser) + +# Element -> Tag Attribute* End Element* End ; +# Tag -> INTEGER ; +# Value -> STRING End ; +# Attribute -> Tag Value ; +# End -> 0 ; + +def parse_element(parser): + if type(parser.curr()) == str: + return parser.consume() + tag_id = parser.expect_predicate(lambda x: type(x) == int) + tag = mapping[tag_id] + attrs = parse_attrs(parser) + parser.expect([0]) + children = [] + while not parser.exhausted() and parser.curr() != 0: + children.append(parse_element(parser)) + parser.expect([0]) + return [tag, attrs, children] + +def parse_attrs(parser): + result = [] + while parser.curr() != 0: + tag_id = parser.expect_predicate(lambda x: type(x) == int) + tag = mapping[tag_id] + value = parser.consume() + result.append((tag, value)) + return result + +def stringify_xml(tree, indent=0): + if type(tree) == str: + return tree + result = "" + tag, attrs, children = tree + + str_attrs = [] + for k, v in attrs: + str_attrs.append("{}=\"{}\"".format(k, v)) + str_attrs = (" " if str_attrs else "") + " ".join(str_attrs) + + str_children = [] + for child in children: + str_children.append(" " * 2 * indent + stringify_xml(child, indent + 1)) + str_children = "\n".join(str_children) + + result += "{}<{}{}>\n{}{}\n{}</{}>".format( + " " * 2 * indent, tag, str_attrs, " " * 2 * indent, str_children, + " " * 2 * indent, tag) + return result + +x = "1 4 McDowell 5 CA 0 2 3 Gayle 0 Some Message 0 0" +print("Input: {}".format(x)) +print("Tokens: {}".format(tokenize(x))) +print("Parsed: {}".format(parse(x))) +print("{}".format(stringify_xml(parse(x)))) diff --git a/scratch/facebook/moderate/parser.py b/scratch/facebook/moderate/parser.py new file mode 100644 index 000000000000..57dfb058c037 --- /dev/null +++ b/scratch/facebook/moderate/parser.py @@ -0,0 +1,37 @@ +class Parser(object): + def __init__(self, tokens): + self.tokens = tokens + self.i = 0 + + def prev(self): + return self.tokens[self.i - 1] + + def curr(self): + return self.tokens[self.i] + + def next(self): + return self.tokens[self.i + 1] + + def consume(self): + if not self.exhausted(): + self.i += 1 + return self.prev() + + def match(self, xs): + if not self.exhausted() and self.curr() in xs: + self.consume() + return True + return False + + def expect(self, xs): + if not self.match(xs): + raise Exception("Expected token \"{}\" but received \"{}\"".format(xs, self.curr())) + return self.prev() + + def expect_predicate(self, predicate): + if predicate(self.curr()): + return self.consume() + raise Exception("Expected token \"{}\" to pass predicate, but it did not".format(self.curr())) + + def exhausted(self): + return self.i >= len(self.tokens) |