1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
|
from parser import Parser
# As an exercise to stress-test my understanding of recursive descent parsers,
# I'm attempting to write a JSON parser without referencing any existing BNF
# descriptions of JSON or existing JSON parser implementations.
#
# I'm only parsing a subset of JSON: enough to parse `sample`. Here is the BNF
# that I wrote to describe my expected input:
#
# expression -> object
# object -> '{' ( STRING ':' expression ) ( ',' STRING ':' expression )* '}'
# | array
# array -> '[' expression ( ',' expression )* ']'
# | literal
# literal -> STRING | INT
def tokenize(xs):
"""
Return a list of tokens from the string input, `xs`.
"""
result = []
i = 0
while i < len(xs):
# single characters
if xs[i] in ",{}:[]":
result.append(xs[i])
i += 1
# strings
elif xs[i] == "\"":
curr = xs[i]
i += 1
while xs[i] != "\"":
curr += xs[i]
i += 1
curr += xs[i]
result.append(curr)
i += 1
# integers
elif xs[i] in "0123456789":
curr = xs[i]
i += 1
while xs[i] in "0123456789":
curr += xs[i]
i += 1
result.append(int(curr))
# whitespace
elif xs[i] in {" ", "\n"}:
i += 1
return result
def parse_json(x):
"""
Attempt to parse the string, `x`, into JSON.
"""
tokens = tokenize(x)
return parse_object(Parser(tokens))
def parse_object(parser):
if parser.match(['{']):
key = parse_string(parser)
parser.expect([':'])
value = parse_object(parser)
result = [(key, value)]
while parser.match([',']):
key = parse_string(parser)
parser.match([':'])
value = parse_object(parser)
result.append((key, value))
return result
return parse_array(parser)
def parse_array(parser):
if parser.match(['[']):
if parser.match([']']):
return []
result = [parse_object(parser)]
while parser.match([',']):
result.append(parse_object(parser))
parser.expect([']'])
return result
else:
return parse_literal(parser)
def parse_string(parser):
if parser.curr().startswith("\""):
return parser.consume()
else:
raise Exception("Unexpected token: {}".format(parser.curr()))
def parse_literal(parser):
return parser.consume()
sample = """
{
"glossary": {
"title": "example glossary",
"GlossDiv": {
"title": "S",
"GlossList": {
"GlossEntry": {
"ID": "SGML",
"SortAs": "SGML",
"GlossTerm": "Standard Generalized Markup Language",
"Acronym": "SGML",
"Abbrev": "ISO 8879:1986",
"GlossDef": {
"para": "A meta-markup language, used to create markup languages such as DocBook.",
"GlossSeeAlso": [
"GML",
"XML"
]
},
"GlossSee": "markup"
}
}
}
}
}
"""
print(parse_json(sample))
|