diff options
Diffstat (limited to 'users/wpcarro/scratch/facebook/parsing/json.py')
-rw-r--r-- | users/wpcarro/scratch/facebook/parsing/json.py | 121 |
1 files changed, 121 insertions, 0 deletions
diff --git a/users/wpcarro/scratch/facebook/parsing/json.py b/users/wpcarro/scratch/facebook/parsing/json.py new file mode 100644 index 000000000000..3975e973fefa --- /dev/null +++ b/users/wpcarro/scratch/facebook/parsing/json.py @@ -0,0 +1,121 @@ +from parser import Parser + +# As an exercise to stress-test my understanding of recursive descent parsers, +# I'm attempting to write a JSON parser without referencing any existing BNF +# descriptions of JSON or existing JSON parser implementations. +# +# I'm only parsing a subset of JSON: enough to parse `sample`. Here is the BNF +# that I wrote to describe my expected input: +# +# expression -> object +# object -> '{' ( STRING ':' expression ) ( ',' STRING ':' expression )* '}' +# | array +# array -> '[' expression ( ',' expression )* ']' +# | literal +# literal -> STRING | INT + +def tokenize(xs): + """ + Return a list of tokens from the string input, `xs`. + """ + result = [] + i = 0 + while i < len(xs): + # single characters + if xs[i] in ",{}:[]": + result.append(xs[i]) + i += 1 + # strings + elif xs[i] == "\"": + curr = xs[i] + i += 1 + while xs[i] != "\"": + curr += xs[i] + i += 1 + curr += xs[i] + result.append(curr) + i += 1 + # integers + elif xs[i] in "0123456789": + curr = xs[i] + i += 1 + while xs[i] in "0123456789": + curr += xs[i] + i += 1 + result.append(int(curr)) + # whitespace + elif xs[i] in {" ", "\n"}: + i += 1 + return result + +def parse_json(x): + """ + Attempt to parse the string, `x`, into JSON. + """ + tokens = tokenize(x) + return parse_object(Parser(tokens)) + +def parse_object(parser): + if parser.match(['{']): + key = parse_string(parser) + parser.expect([':']) + value = parse_object(parser) + result = [(key, value)] + while parser.match([',']): + key = parse_string(parser) + parser.match([':']) + value = parse_object(parser) + result.append((key, value)) + return result + return parse_array(parser) + +def parse_array(parser): + if parser.match(['[']): + if parser.match([']']): + return [] + result = [parse_object(parser)] + while parser.match([',']): + result.append(parse_object(parser)) + parser.expect([']']) + return result + else: + return parse_literal(parser) + +def parse_string(parser): + if parser.curr().startswith("\""): + return parser.consume() + else: + raise Exception("Unexpected token: {}".format(parser.curr())) + +def parse_literal(parser): + return parser.consume() + +sample = """ +{ + "glossary": { + "title": "example glossary", + "GlossDiv": { + "title": "S", + "GlossList": { + "GlossEntry": { + "ID": "SGML", + "SortAs": "SGML", + "GlossTerm": "Standard Generalized Markup Language", + "Acronym": "SGML", + "Abbrev": "ISO 8879:1986", + "GlossDef": { + "para": "A meta-markup language, used to create markup languages such as DocBook.", + "GlossSeeAlso": [ + "GML", + "XML" + ] + }, + "GlossSee": "markup" + } + } + } + } +} +""" + +print(parse_json(sample)) |