diff options
author | Vincent Ambo <mail@tazj.in> | 2021-12-13T22·51+0300 |
---|---|---|
committer | Vincent Ambo <mail@tazj.in> | 2021-12-13T23·15+0300 |
commit | 019f8fd2113df4c5247c3969c60fd4f0e08f91f7 (patch) | |
tree | 76a857f61aa88f62a30e854651e8439db77fd0ea /users/wpcarro/scratch/facebook/parsing/json.py | |
parent | 464bbcb15c09813172c79820bcf526bb10cf4208 (diff) | |
parent | 6123e976928ca3d8d93f0b2006b10b5f659eb74d (diff) |
subtree(users/wpcarro): docking briefcase at '24f5a642' r/3226
git-subtree-dir: users/wpcarro git-subtree-mainline: 464bbcb15c09813172c79820bcf526bb10cf4208 git-subtree-split: 24f5a642af3aa1627bbff977f0a101907a02c69f Change-Id: I6105b3762b79126b3488359c95978cadb3efa789
Diffstat (limited to 'users/wpcarro/scratch/facebook/parsing/json.py')
-rw-r--r-- | users/wpcarro/scratch/facebook/parsing/json.py | 121 |
1 files changed, 121 insertions, 0 deletions
diff --git a/users/wpcarro/scratch/facebook/parsing/json.py b/users/wpcarro/scratch/facebook/parsing/json.py new file mode 100644 index 000000000000..3975e973fefa --- /dev/null +++ b/users/wpcarro/scratch/facebook/parsing/json.py @@ -0,0 +1,121 @@ +from parser import Parser + +# As an exercise to stress-test my understanding of recursive descent parsers, +# I'm attempting to write a JSON parser without referencing any existing BNF +# descriptions of JSON or existing JSON parser implementations. +# +# I'm only parsing a subset of JSON: enough to parse `sample`. Here is the BNF +# that I wrote to describe my expected input: +# +# expression -> object +# object -> '{' ( STRING ':' expression ) ( ',' STRING ':' expression )* '}' +# | array +# array -> '[' expression ( ',' expression )* ']' +# | literal +# literal -> STRING | INT + +def tokenize(xs): + """ + Return a list of tokens from the string input, `xs`. + """ + result = [] + i = 0 + while i < len(xs): + # single characters + if xs[i] in ",{}:[]": + result.append(xs[i]) + i += 1 + # strings + elif xs[i] == "\"": + curr = xs[i] + i += 1 + while xs[i] != "\"": + curr += xs[i] + i += 1 + curr += xs[i] + result.append(curr) + i += 1 + # integers + elif xs[i] in "0123456789": + curr = xs[i] + i += 1 + while xs[i] in "0123456789": + curr += xs[i] + i += 1 + result.append(int(curr)) + # whitespace + elif xs[i] in {" ", "\n"}: + i += 1 + return result + +def parse_json(x): + """ + Attempt to parse the string, `x`, into JSON. + """ + tokens = tokenize(x) + return parse_object(Parser(tokens)) + +def parse_object(parser): + if parser.match(['{']): + key = parse_string(parser) + parser.expect([':']) + value = parse_object(parser) + result = [(key, value)] + while parser.match([',']): + key = parse_string(parser) + parser.match([':']) + value = parse_object(parser) + result.append((key, value)) + return result + return parse_array(parser) + +def parse_array(parser): + if parser.match(['[']): + if parser.match([']']): + return [] + result = [parse_object(parser)] + while parser.match([',']): + result.append(parse_object(parser)) + parser.expect([']']) + return result + else: + return parse_literal(parser) + +def parse_string(parser): + if parser.curr().startswith("\""): + return parser.consume() + else: + raise Exception("Unexpected token: {}".format(parser.curr())) + +def parse_literal(parser): + return parser.consume() + +sample = """ +{ + "glossary": { + "title": "example glossary", + "GlossDiv": { + "title": "S", + "GlossList": { + "GlossEntry": { + "ID": "SGML", + "SortAs": "SGML", + "GlossTerm": "Standard Generalized Markup Language", + "Acronym": "SGML", + "Abbrev": "ISO 8879:1986", + "GlossDef": { + "para": "A meta-markup language, used to create markup languages such as DocBook.", + "GlossSeeAlso": [ + "GML", + "XML" + ] + }, + "GlossSee": "markup" + } + } + } + } +} +""" + +print(parse_json(sample)) |