about summary refs log tree commit diff
path: root/scratch/facebook/moderate/decompress-xml.py
diff options
context:
space:
mode:
author William Carroll <wpcarro@gmail.com> 2020-11-14T13·58+0000
committer William Carroll <wpcarro@gmail.com> 2020-11-14T14·00+0000
commit c841527f616ffb4a6cd64f941421cf42f54b7d00 (patch)
tree 09264aa849444fa1e648691464e7890427211afb /scratch/facebook/moderate/decompress-xml.py
parent bfd2180e6bb0cdbcc3214096854416265b2ad8f4 (diff)
Write encoded XML parser and pretty-printer
Write a function that reads a string of compressed XML and outputs the
decompressed version.

Note to self: Now that I'm growing more comfortable writing parsers, I'd like to
become equally comfortable writing pretty-printers.
Diffstat (limited to 'scratch/facebook/moderate/decompress-xml.py')
-rw-r--r-- scratch/facebook/moderate/decompress-xml.py 98
1 files changed, 98 insertions, 0 deletions
diff --git a/scratch/facebook/moderate/decompress-xml.py b/scratch/facebook/moderate/decompress-xml.py
new file mode 100644
index 000000000000..b22983ed7aff
--- /dev/null
+++ b/scratch/facebook/moderate/decompress-xml.py
@@ -0,0 +1,98 @@
+import string
+from parser import Parser
+
+mapping = {
+    1: "family",
+    2: "person",
+    3: "firstName",
+    4: "lastName",
+    5: "state",
+}
+
+def parse_int(i, xs):
+    result = ""
+    while i < len(xs) and xs[i] in string.digits:
+        result += xs[i]
+        i += 1
+    return i, int(result)
+
+def parse_string(i, xs):
+    result = ""
+    while xs[i+1] not in string.digits:
+        result += xs[i]
+        i += 1
+    return i, result
+
+def tokenize(xs):
+    result = []
+    i = 0
+    while i < len(xs):
+        if xs[i] in string.digits:
+            i, n = parse_int(i, xs)
+            result.append(n)
+        elif xs[i] in string.ascii_letters:
+            i, x = parse_string(i, xs)
+            result.append(x)
+        elif xs[i] == " ":
+            i += 1
+            continue
+    return result
+
+def parse(xs):
+    parser = Parser(tokenize(xs))
+    return parse_element(parser)
+
+# Grammar accepted by the parser below:
+# Element   -> Tag Attribute* End Child* End ;
+# Child     -> Element | STRING ;
+# Tag       -> INTEGER ;
+# Attribute -> Tag Value ;
+# Value     -> STRING ;
+# End       -> 0 ;
+
+def parse_element(parser):
+    if type(parser.curr()) == str:
+        return parser.consume()
+    tag_id = parser.expect_predicate(lambda x: type(x) == int)
+    tag = mapping[tag_id]
+    attrs = parse_attrs(parser)
+    parser.expect([0])
+    children = []
+    while not parser.exhausted() and parser.curr() != 0:
+        children.append(parse_element(parser))
+    parser.expect([0])
+    return [tag, attrs, children]
+
+def parse_attrs(parser):
+    result = []
+    while parser.curr() != 0:
+        tag_id = parser.expect_predicate(lambda x: type(x) == int)
+        tag = mapping[tag_id]
+        value = parser.consume()
+        result.append((tag, value))
+    return result
+
+def stringify_xml(tree, indent=0):
+    if type(tree) == str:
+        return tree
+    result = ""
+    tag, attrs, children = tree
+
+    str_attrs = []
+    for k, v in attrs:
+        str_attrs.append("{}=\"{}\"".format(k, v))
+    str_attrs = (" " if str_attrs else "") + " ".join(str_attrs)
+
+    str_children = []
+    for child in children:
+        str_children.append(" " * 2 * indent + stringify_xml(child, indent + 1))
+    str_children = "\n".join(str_children)
+
+    result += "{}<{}{}>\n{}{}\n{}</{}>".format(
+        " " * 2 * indent, tag, str_attrs, " " * 2 * indent, str_children,
+        " " * 2 * indent, tag)
+    return result
+
+x = "1 4 McDowell 5 CA 0 2 3 Gayle 0 Some Message 0 0"
+print("Input:   {}".format(x))
+print("Tokens:  {}".format(tokenize(x)))
+print("Parsed:  {}".format(parse(x)))
+print("{}".format(stringify_xml(parse(x))))