diff --git a/users/wpcarro/scratch/facebook/parsing/json.py b/users/wpcarro/scratch/facebook/parsing/json.py
new file mode 100644
index 000000000000..3975e973fefa
--- /dev/null
+++ b/users/wpcarro/scratch/facebook/parsing/json.py
@@ -0,0 +1,177 @@
+# `Parser` comes from a sibling `parser` module that isn't included in this
+# diff. If that import fails, fall back to a minimal sketch of the interface
+# this file assumes (curr/consume/match/expect); the real class may differ.
+try:
+    from parser import Parser
+except ImportError:
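+    class Parser(object):
+        def __init__(self, tokens):
+            self.tokens = tokens
+            self.i = 0
+
+        def curr(self):
+            # Peek at the current token without consuming it.
+            return self.tokens[self.i]
+
+        def consume(self):
+            # Return the current token and advance past it.
+            result = self.tokens[self.i]
+            self.i += 1
+            return result
+
+        def match(self, xs):
+            # If the current token is one of `xs`, consume it and return True.
+            if self.i < len(self.tokens) and self.tokens[self.i] in xs:
+                self.i += 1
+                return True
+            return False
+
+        def expect(self, xs):
+            # Like match, but raise when the current token isn't one of `xs`.
+            if not self.match(xs):
+                raise Exception("Expected one of: {}".format(xs))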
+
+# As an exercise to stress-test my understanding of recursive descent parsers,
+# I'm attempting to write a JSON parser without referencing any existing BNF
+# descriptions of JSON or existing JSON parser implementations.
+#
+# I'm only parsing a subset of JSON: enough to parse `sample`. Here is the BNF
+# that I wrote to describe my expected input:
+#
+# expression -> object
+# object     -> '{' ( STRING ':' expression ) ( ',' STRING ':' expression )* '}'
+#            |  array
+# array      -> '[' expression ( ',' expression )* ']'
+#            |  literal
+# literal    -> STRING | INT
+
+def tokenize(xs):
+    """
+    Return a list of tokens from the string input, `xs`.
+    """
+    result = []
+    i = 0
+    while i < len(xs):
+        # single characters
+        if xs[i] in ",{}:[]":
+            result.append(xs[i])
+            i += 1
+        # strings
+        elif xs[i] == "\"":
+            curr = xs[i]
+            i += 1
+            while i < len(xs) and xs[i] != "\"":
+                curr += xs[i]
+                i += 1
+            if i == len(xs):
+                raise Exception("Unterminated string: {}".format(curr))
+            curr += xs[i]
+            result.append(curr)
+            i += 1
+        # integers
+        elif xs[i] in "0123456789":
+            curr = xs[i]
+            i += 1
+            while i < len(xs) and xs[i] in "0123456789":
+                curr += xs[i]
+                i += 1
+            result.append(int(curr))
+        # whitespace
+        elif xs[i] in {" ", "\t", "\r", "\n"}:
+            i += 1
+        # fail loudly on unexpected characters instead of looping forever
+        else:
+            raise Exception("Unexpected character: {}".format(xs[i]))
+    return result
+
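+# For example -- string tokens keep their surrounding quotes, and integer
+# tokens are converted to ints:
+assert tokenize('{"a": [1, 20]}') == \
+    ['{', '"a"', ':', '[', 1, ',', 20, ']', '}']
+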
+def parse_json(x):
+    """
+    Attempt to parse the string, `x`, as JSON: objects become association
+    lists, arrays become lists, and literals are returned as tokens.
+    """
+    tokens = tokenize(x)
+    return parse_object(Parser(tokens))
+
+def parse_object(parser):
+    """
+    Parse an object, falling through to arrays and then literals, mirroring
+    the BNF above. Objects are returned as association lists of
+    (key, value) pairs rather than dicts.
+    """
+    if parser.match(['{']):
+        key = parse_string(parser)
+        parser.expect([':'])
+        value = parse_object(parser)
+        result = [(key, value)]
+        while parser.match([',']):
+            key = parse_string(parser)
+            parser.expect([':'])
+            value = parse_object(parser)
+            result.append((key, value))
+        parser.expect(['}'])
+        return result
+    return parse_array(parser)
+
+def parse_array(parser):
+    if parser.match(['[']):
+        if parser.match([']']):
+            return []
+        result = [parse_object(parser)]
+        while parser.match([',']):
+            result.append(parse_object(parser))
+        parser.expect([']'])
+        return result
+    else:
+        return parse_literal(parser)
+
+def parse_string(parser):
+    # String tokens retain their surrounding quotes, so a leading '"' is
+    # enough to distinguish them from other tokens.
+    if isinstance(parser.curr(), str) and parser.curr().startswith("\""):
+        return parser.consume()
+    else:
+        raise Exception("Unexpected token: {}".format(parser.curr()))
+
+def parse_literal(parser):
+    # Anything that isn't an object, array, or key string: a STRING or INT.
+    return parser.consume()
+
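+# End to end, objects come back as association lists whose keys keep their
+# quotes. A small example:
+assert parse_json('{"a": [1, 2], "b": "hi"}') == \
+    [('"a"', [1, 2]), ('"b"', '"hi"')]
+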
+sample = """
+{
+  "glossary": {
+    "title": "example glossary",
+    "GlossDiv": {
+      "title": "S",
+      "GlossList": {
+        "GlossEntry": {
+          "ID": "SGML",
+          "SortAs": "SGML",
+          "GlossTerm": "Standard Generalized Markup Language",
+          "Acronym": "SGML",
+          "Abbrev": "ISO 8879:1986",
+          "GlossDef": {
+            "para": "A meta-markup language, used to create markup languages such as DocBook.",
+            "GlossSeeAlso": [
+              "GML",
+              "XML"
+            ]
+          },
+          "GlossSee": "markup"
+        }
+      }
+    }
+  }
+}
+"""
+
+print(parse_json(sample))