[issue4136] merge json library with latest simplejson 2.0.x - Code Review (original) (raw)
OLD
NEW
1 """Implementation of JSONDecoder
1 """Implementation of JSONDecoder
2 """
2 """
3
4 import re
3 import re
5 import sys
4 import sys
5 import struct
6
6
7 from json.scanner import Scanner, pattern
7 from json.scanner import make_scanner
8 try:
8 try:
9 from _json import scanstring as c_scanstring
9 from _json import scanstring as c_scanstring
10 except ImportError:
10 except ImportError:
11 c_scanstring = None
11 c_scanstring = None
12
12
13 __all__ = ['JSONDecoder']
13 __all__ = ['JSONDecoder']
14
14
15 FLAGS = re.VERBOSE | re.MULTILINE | re.DOTALL
15 FLAGS = re.VERBOSE | re.MULTILINE | re.DOTALL
16
16
17 NaN, PosInf, NegInf = float('nan'), float('inf'), float('-inf')
17 def _floatconstants():
18 _BYTES = '7FF80000000000007FF0000000000000'.decode('hex')
19 if sys.byteorder != 'big':
20 _BYTES = _BYTES[:8][::-1] + _BYTES[8:][::-1]
21 nan, inf = struct.unpack('dd', _BYTES)
22 return nan, inf, -inf
23
24 NaN, PosInf, NegInf = _floatconstants()
18
25
19
26
20 def linecol(doc, pos):
27 def linecol(doc, pos):
21 lineno = doc.count('\n', 0, pos) + 1
28 lineno = doc.count('\n', 0, pos) + 1
22 if lineno == 1:
29 if lineno == 1:
23 colno = pos
30 colno = pos
24 else:
31 else:
25 colno = pos - doc.rindex('\n', 0, pos)
32 colno = pos - doc.rindex('\n', 0, pos)
26 return lineno, colno
33 return lineno, colno
27
34
28
35
29 def errmsg(msg, doc, pos, end=None):
36 def errmsg(msg, doc, pos, end=None):
37 # Note that this function is called from _json
30 lineno, colno = linecol(doc, pos)
38 lineno, colno = linecol(doc, pos)
31 if end is None:
39 if end is None:
32 fmt = '{0}: line {1} column {2} (char {3})'
40 fmt = '{0}: line {1} column {2} (char {3})'
33 return fmt.format(msg, lineno, colno, pos)
41 return fmt.format(msg, lineno, colno, pos)
42 #fmt = '%s: line %d column %d (char %d)'
43 #return fmt % (msg, lineno, colno, pos)
34 endlineno, endcolno = linecol(doc, end)
44 endlineno, endcolno = linecol(doc, end)
35 fmt = '{0}: line {1} column {2} - line {3} column {4} (char {5} - {6})'
45 fmt = '{0}: line {1} column {2} - line {3} column {4} (char {5} - {6})'
36 return fmt.format(msg, lineno, colno, endlineno, endcolno, pos, end)
46 return fmt.format(msg, lineno, colno, endlineno, endcolno, pos, end)
47 #fmt = '%s: line %d column %d - line %d column %d (char %d - %d)'
48 #return fmt % (msg, lineno, colno, endlineno, endcolno, pos, end)
37
49
38
50
39 _CONSTANTS = {
51 _CONSTANTS = {
40 '-Infinity': NegInf,
52 '-Infinity': NegInf,
41 'Infinity': PosInf,
53 'Infinity': PosInf,
42 'NaN': NaN,
54 'NaN': NaN,
43 'true': True,
44 'false': False,
45 'null': None,
46 }
55 }
47
56
48
49 def JSONConstant(match, context, c=_CONSTANTS):
50 s = match.group(0)
51 fn = getattr(context, 'parse_constant', None)
52 if fn is None:
53 rval = c[s]
54 else:
55 rval = fn(s)
56 return rval, None
57 pattern('(-?Infinity|NaN|true|false|null)')(JSONConstant)
58
59
60 def JSONNumber(match, context):
61 match = JSONNumber.regex.match(match.string, *match.span())
62 integer, frac, exp = match.groups()
63 if frac or exp:
64 fn = getattr(context, 'parse_float', None) or float
65 res = fn(integer + (frac or '') + (exp or ''))
66 else:
67 fn = getattr(context, 'parse_int', None) or int
68 res = fn(integer)
69 return res, None
70 pattern(r'(-?(?:0|[1-9]\d*))(\.\d+)?([eE][-+]?\d+)?')(JSONNumber)
71
72
73 STRINGCHUNK = re.compile(r'(.*?)(["\\\x00-\x1f])', FLAGS)
57 STRINGCHUNK = re.compile(r'(.*?)(["\\\x00-\x1f])', FLAGS)
74 BACKSLASH = {
58 BACKSLASH = {
75 '"': u'"', '\\': u'\\', '/': u'/',
59 '"': u'"', '\\': u'\\', '/': u'/',
76 'b': u'\b', 'f': u'\f', 'n': u'\n', 'r': u'\r', 't': u'\t',
60 'b': u'\b', 'f': u'\f', 'n': u'\n', 'r': u'\r', 't': u'\t',
77 }
61 }
78
62
79 DEFAULT_ENCODING = "utf-8"
63 DEFAULT_ENCODING = "utf-8"
80
64
81
65 def py_scanstring(s, end, encoding=None, strict=True,
82 def py_scanstring(s, end, encoding=None, strict=True, _b=BACKSLASH, _m=STRINGCHUNK.match):
66 _b=BACKSLASH, _m=STRINGCHUNK.match):
67 """Scan the string s for a JSON string. End is the index of the
68 character in s after the quote that started the JSON string.
69 Unescapes all valid JSON string escape sequences and raises ValueError
70 on attempt to decode an invalid string. If strict is False then literal
71 control characters are allowed in the string.
72 ····
73 Returns a tuple of the decoded string and the index of the character in s
74 after the end quote."""
83 if encoding is None:
75 if encoding is None:
84 encoding = DEFAULT_ENCODING
76 encoding = DEFAULT_ENCODING
85 chunks = []
77 chunks = []
86 _append = chunks.append
78 _append = chunks.append
87 begin = end - 1
79 begin = end - 1
88 while 1:
80 while 1:
89 chunk = _m(s, end)
81 chunk = _m(s, end)
90 if chunk is None:
82 if chunk is None:
91 raise ValueError(
83 raise ValueError(
92 errmsg("Unterminated string starting at", s, begin))
84 errmsg("Unterminated string starting at", s, begin))
93 end = chunk.end()
85 end = chunk.end()
94 content, terminator = chunk.groups()
86 content, terminator = chunk.groups()
87 # Content contains zero or more unescaped string characters
95 if content:
88 if content:
96 if not isinstance(content, unicode):
89 if not isinstance(content, unicode):
97 content = unicode(content, encoding)
90 content = unicode(content, encoding)
98 _append(content)
91 _append(content)
92 # Terminator is the end of string, a literal control character,
93 # or a backslash denoting that an escape sequence follows
99 if terminator == '"':
94 if terminator == '"':
100 break
95 break
101 elif terminator != '\\':
96 elif terminator != '\\':
102 if strict:
97 if strict:
98 #msg = "Invalid control character %r at" % (terminator,)
103 msg = "Invalid control character {0!r} at".format(terminator)
99 msg = "Invalid control character {0!r} at".format(terminator)
104 raise ValueError(errmsg(msg, s, end))
100 raise ValueError(errmsg(msg, s, end))
105 else:
101 else:
106 _append(terminator)
102 _append(terminator)
107 continue
103 continue
108 try:
104 try:
109 esc = s[end]
105 esc = s[end]
110 except IndexError:
106 except IndexError:
111 raise ValueError(
107 raise ValueError(
112 errmsg("Unterminated string starting at", s, begin))
108 errmsg("Unterminated string starting at", s, begin))
109 # If not a unicode escape sequence, must be in the lookup table
113 if esc != 'u':
110 if esc != 'u':
114 try:
111 try:
115 m = _b[esc]
112 char = _b[esc]
116 except KeyError:
113 except KeyError:
117 msg = "Invalid \\escape: {0!r}".format(esc)
114 msg = "Invalid \\escape: " + repr(esc)
118 raise ValueError(errmsg(msg, s, end))
115 raise ValueError(errmsg(msg, s, end))
119 end += 1
116 end += 1
120 else:
117 else:
118 # Unicode escape sequence
121 esc = s[end + 1:end + 5]
119 esc = s[end + 1:end + 5]
122 next_end = end + 5
120 next_end = end + 5
123 msg = "Invalid \\uXXXX escape"
121 if len(esc) != 4:
124 try:
122 msg = "Invalid \\uXXXX escape"
125 if len(esc) != 4:
126 raise ValueError
127 uni = int(esc, 16)
128 if 0xd800 <= uni <= 0xdbff and sys.maxunicode > 65535:
129 msg = "Invalid \\uXXXX\\uXXXX surrogate pair"
130 if not s[end + 5:end + 7] == '\\u':
131 raise ValueError
132 esc2 = s[end + 7:end + 11]
133 if len(esc2) != 4:
134 raise ValueError
135 uni2 = int(esc2, 16)
136 uni = 0x10000 + (((uni - 0xd800) << 10) | (uni2 - 0xdc00))
137 next_end += 6
138 m = unichr(uni)
139 except ValueError:
140 raise ValueError(errmsg(msg, s, end))
123 raise ValueError(errmsg(msg, s, end))
124 uni = int(esc, 16)
125 # Check for surrogate pair on UCS-4 systems
126 if 0xd800 <= uni <= 0xdbff and sys.maxunicode > 65535:
127 msg = "Invalid \\uXXXX\\uXXXX surrogate pair"
128 if not s[end + 5:end + 7] == '\\u':
129 raise ValueError(errmsg(msg, s, end))
130 esc2 = s[end + 7:end + 11]
131 if len(esc2) != 4:
132 raise ValueError(errmsg(msg, s, end))
133 uni2 = int(esc2, 16)
134 uni = 0x10000 + (((uni - 0xd800) << 10) | (uni2 - 0xdc00))
135 next_end += 6
136 char = unichr(uni)
141 end = next_end
137 end = next_end
142 _append(m)
138 # Append the unescaped character
139 _append(char)
143 return u''.join(chunks), end
140 return u''.join(chunks), end
144
141
145
142
146 # Use speedup
143 # Use speedup if available
147 if c_scanstring is not None:
144 scanstring = c_scanstring or py_scanstring
148 scanstring = c_scanstring
149 else:
150 scanstring = py_scanstring
151
145
152 def JSONString(match, context):
146 WHITESPACE = re.compile(r'[ \t\n\r]*', FLAGS)
153 encoding = getattr(context, 'encoding', None)
147 WHITESPACE_STR = ' \t\n\r'
154 strict = getattr(context, 'strict', True)
155 return scanstring(match.string, match.end(), encoding, strict)
156 pattern(r'"')(JSONString)
157
148
158
149 def JSONObject((s, end), encoding, strict, scan_once, object_hook,
159 WHITESPACE = re.compile(r'\s*', FLAGS)
150 _w=WHITESPACE.match, _ws=WHITESPACE_STR):
160
161
162 def JSONObject(match, context, _w=WHITESPACE.match):
163 pairs = {}
151 pairs = {}
164 s = match.string
152 # Use a slice to prevent IndexError from being raised, the following
165 end = _w(s, match.end()).end()
153 # check will raise a more specific ValueError if the string is empty
166 nextchar = s[end:end + 1]
154 nextchar = s[end:end + 1]
167 # Trivial empty object
155 # Normally we expect nextchar == '"'
168 if nextchar == '}':
169 return pairs, end + 1
170 if nextchar != '"':
156 if nextchar != '"':
171 raise ValueError(errmsg("Expecting property name", s, end))
157 if nextchar in _ws:
158 end = _w(s, end).end()
159 nextchar = s[end:end + 1]
160 # Trivial empty object
161 if nextchar == '}':
162 return pairs, end + 1
163 elif nextchar != '"':
164 raise ValueError(errmsg("Expecting property name", s, end))
172 end += 1
165 end += 1
173 encoding = getattr(context, 'encoding', None)
174 strict = getattr(context, 'strict', True)
175 iterscan = JSONScanner.iterscan
176 while True:
166 while True:
177 key, end = scanstring(s, end, encoding, strict)
167 key, end = scanstring(s, end, encoding, strict)
178 end = _w(s, end).end()
168
169 # To skip some function call overhead we optimize the fast paths where
170 # the JSON key separator is ": " or just ":".
179 if s[end:end + 1] != ':':
171 if s[end:end + 1] != ':':
180 raise ValueError(errmsg("Expecting : delimiter", s, end))
172 end = _w(s, end).end()
181 end = _w(s, end + 1).end()
173 if s[end:end + 1] != ':':
174 raise ValueError(errmsg("Expecting : delimiter", s, end))
175
176 end += 1
177
182 try:
178 try:
183 value, end = iterscan(s, idx=end, context=context).next()
179 if s[end] in _ws:
180 end += 1
181 if s[end] in _ws:
182 end = _w(s, end + 1).end()
183 except IndexError:
184 pass
185
186 try:
187 value, end = scan_once(s, end)
184 except StopIteration:
188 except StopIteration:
185 raise ValueError(errmsg("Expecting object", s, end))
189 raise ValueError(errmsg("Expecting object", s, end))
186 pairs[key] = value
190 pairs[key] = value
187 end = _w(s, end).end()
191
188 nextchar = s[end:end + 1]
192 try:
193 nextchar = s[end]
194 if nextchar in _ws:
195 end = _w(s, end + 1).end()
196 nextchar = s[end]
197 except IndexError:
198 nextchar = ''
189 end += 1
199 end += 1
200
190 if nextchar == '}':
201 if nextchar == '}':
191 break
202 break
192 if nextchar != ',':
203 elif nextchar != ',':
193 raise ValueError(errmsg("Expecting , delimiter", s, end - 1))
204 raise ValueError(errmsg("Expecting , delimiter", s, end - 1))
194 end = _w(s, end).end()
205
195 nextchar = s[end:end + 1]
206 try:
207 nextchar = s[end]
208 if nextchar in _ws:
209 end += 1
210 nextchar = s[end]
211 if nextchar in _ws:
212 end = _w(s, end + 1).end()
213 nextchar = s[end]
214 except IndexError:
215 nextchar = ''
216
196 end += 1
217 end += 1
197 if nextchar != '"':
218 if nextchar != '"':
198 raise ValueError(errmsg("Expecting property name", s, end - 1))
219 raise ValueError(errmsg("Expecting property name", s, end - 1))
199 object_hook = getattr(context, 'object_hook', None)
220
200 if object_hook is not None:
221 if object_hook is not None:
201 pairs = object_hook(pairs)
222 pairs = object_hook(pairs)
202 return pairs, end
223 return pairs, end
203 pattern(r'{')(JSONObject)
204
224
205
225 def JSONArray((s, end), scan_once, _w=WHITESPACE.match, _ws=WHITESPACE_STR):
206 def JSONArray(match, context, _w=WHITESPACE.match):
207 values = []
226 values = []
208 s = match.string
227 nextchar = s[end:end + 1]
209 end = _w(s, match.end()).end()
228 if nextchar in _ws:
229 end = _w(s, end + 1).end()
230 nextchar = s[end:end + 1]
210 # Look-ahead for trivial empty array
231 # Look-ahead for trivial empty array
211 nextchar = s[end:end + 1]
212 if nextchar == ']':
232 if nextchar == ']':
213 return values, end + 1
233 return values, end + 1
214 iterscan = JSONScanner.iterscan
234 _append = values.append
215 while True:
235 while True:
216 try:
236 try:
217 value, end = iterscan(s, idx=end, context=context).next()
237 value, end = scan_once(s, end)
218 except StopIteration:
238 except StopIteration:
219 raise ValueError(errmsg("Expecting object", s, end))
239 raise ValueError(errmsg("Expecting object", s, end))
220 values.append(value)
240 _append(value)
221 end = _w(s, end).end()
222 nextchar = s[end:end + 1]
241 nextchar = s[end:end + 1]
242 if nextchar in _ws:
243 end = _w(s, end + 1).end()
244 nextchar = s[end:end + 1]
223 end += 1
245 end += 1
224 if nextchar == ']':
246 if nextchar == ']':
225 break
247 break
226 if nextchar != ',':
248 elif nextchar != ',':
227 raise ValueError(errmsg("Expecting , delimiter", s, end))
249 raise ValueError(errmsg("Expecting , delimiter", s, end))
228 end = _w(s, end).end()
250
251 try:
252 if s[end] in _ws:
253 end += 1
254 if s[end] in _ws:
255 end = _w(s, end + 1).end()
256 except IndexError:
257 pass
258
229 return values, end
259 return values, end
230 pattern(r'\[')(JSONArray)
231
232
233 ANYTHING = [
234 JSONObject,
235 JSONArray,
236 JSONString,
237 JSONConstant,
238 JSONNumber,
239 ]
240
241 JSONScanner = Scanner(ANYTHING)
242
243
260
244 class JSONDecoder(object):
261 class JSONDecoder(object):
245 """Simple JSON http://json.org decoder
262 """Simple JSON http://json.org decoder
246
263
247 Performs the following translations in decoding by default:
264 Performs the following translations in decoding by default:
248
265
249 +---------------+-------------------+
266 +---------------+-------------------+
250 | JSON | Python |
267 | JSON | Python |
251 +===============+===================+
268 +===============+===================+
252 | object | dict |
269 | object | dict |
253 +---------------+-------------------+
270 +---------------+-------------------+
254 | array | list |
271 | array | list |
255 +---------------+-------------------+
272 +---------------+-------------------+
256 | string | unicode |
273 | string | unicode |
257 +---------------+-------------------+
274 +---------------+-------------------+
258 | number (int) | int, long |
275 | number (int) | int, long |
259 +---------------+-------------------+
276 +---------------+-------------------+
260 | number (real) | float |
277 | number (real) | float |
261 +---------------+-------------------+
278 +---------------+-------------------+
262 | true | True |
279 | true | True |
263 +---------------+-------------------+
280 +---------------+-------------------+
264 | false | False |
281 | false | False |
265 +---------------+-------------------+
282 +---------------+-------------------+
266 | null | None |
283 | null | None |
267 +---------------+-------------------+
284 +---------------+-------------------+
268
285
269 It also understands ``NaN``, ``Infinity``, and ``-Infinity`` as
286 It also understands ``NaN``, ``Infinity``, and ``-Infinity`` as
270 their corresponding ``float`` values, which is outside the JSON spec.
287 their corresponding ``float`` values, which is outside the JSON spec.
288
271 """
289 """
272
290
273 _scanner = Scanner(ANYTHING)
274 __all__ = ['__init__', 'decode', 'raw_decode']
275
276 def __init__(self, encoding=None, object_hook=None, parse_float=None,
291 def __init__(self, encoding=None, object_hook=None, parse_float=None,
277 parse_int=None, parse_constant=None, strict=True):
292 parse_int=None, parse_constant=None, strict=True):
278 """``encoding`` determines the encoding used to interpret any ``str``
293 """``encoding`` determines the encoding used to interpret any ``str``
279 objects decoded by this instance (utf-8 by default). It has no
294 objects decoded by this instance (utf-8 by default). It has no
280 effect when decoding ``unicode`` objects.
295 effect when decoding ``unicode`` objects.
281
296
282 Note that currently only encodings that are a superset of ASCII work,
297 Note that currently only encodings that are a superset of ASCII work,
283 strings of other encodings should be passed in as ``unicode``.
298 strings of other encodings should be passed in as ``unicode``.
284
299
285 ``object_hook``, if specified, will be called with the result of
300 ``object_hook``, if specified, will be called with the result
286 every JSON object decoded and its return value will be used in
301 of every JSON object decoded and its return value will be used in
287 place of the given ``dict``. This can be used to provide custom
302 place of the given ``dict``. This can be used to provide custom
288 deserializations (e.g. to support JSON-RPC class hinting).
303 deserializations (e.g. to support JSON-RPC class hinting).
289
304
290 ``parse_float``, if specified, will be called with the string
305 ``parse_float``, if specified, will be called with the string
291 of every JSON float to be decoded. By default this is equivalent to
306 of every JSON float to be decoded. By default this is equivalent to
292 float(num_str). This can be used to use another datatype or parser
307 float(num_str). This can be used to use another datatype or parser
293 for JSON floats (e.g. decimal.Decimal).
308 for JSON floats (e.g. decimal.Decimal).
294
309
295 ``parse_int``, if specified, will be called with the string
310 ``parse_int``, if specified, will be called with the string
296 of every JSON int to be decoded. By default this is equivalent to
311 of every JSON int to be decoded. By default this is equivalent to
297 int(num_str). This can be used to use another datatype or parser
312 int(num_str). This can be used to use another datatype or parser
298 for JSON integers (e.g. float).
313 for JSON integers (e.g. float).
299
314
300 ``parse_constant``, if specified, will be called with one of the
315 ``parse_constant``, if specified, will be called with one of the
301 following strings: -Infinity, Infinity, NaN, null, true, false.
316 following strings: -Infinity, Infinity, NaN.
302 This can be used to raise an exception if invalid JSON numbers
317 This can be used to raise an exception if invalid JSON numbers
303 are encountered.
318 are encountered.
304
319
305 """
320 """
306 self.encoding = encoding
321 self.encoding = encoding
307 self.object_hook = object_hook
322 self.object_hook = object_hook
308 self.parse_float = parse_float
323 self.parse_float = parse_float or float
309 self.parse_int = parse_int
324 self.parse_int = parse_int or int
310 self.parse_constant = parse_constant
325 self.parse_constant = parse_constant or _CONSTANTS.__getitem__
311 self.strict = strict
326 self.strict = strict
327 self.parse_object = JSONObject
328 self.parse_array = JSONArray
329 self.parse_string = scanstring
330 self.scan_once = make_scanner(self)
312
331
313 def decode(self, s, _w=WHITESPACE.match):
332 def decode(self, s, _w=WHITESPACE.match):
314 """
333 """Return the Python representation of ``s`` (a ``str`` or ``unicode``
315 Return the Python representation of ``s`` (a ``str`` or ``unicode``
316 instance containing a JSON document)
334 instance containing a JSON document)
317
335
318 """
336 """
319 obj, end = self.raw_decode(s, idx=_w(s, 0).end())
337 obj, end = self.raw_decode(s, idx=_w(s, 0).end())
320 end = _w(s, end).end()
338 end = _w(s, end).end()
321 if end != len(s):
339 if end != len(s):
322 raise ValueError(errmsg("Extra data", s, end, len(s)))
340 raise ValueError(errmsg("Extra data", s, end, len(s)))
323 return obj
341 return obj
324
342
325 def raw_decode(self, s, **kw):
343 def raw_decode(self, s, idx=0):
326 """Decode a JSON document from ``s`` (a ``str`` or ``unicode`` beginning
344 """Decode a JSON document from ``s`` (a ``str`` or ``unicode``
327 with a JSON document) and return a 2-tuple of the Python
345 beginning with a JSON document) and return a 2-tuple of the Python
328 representation and the index in ``s`` where the document ended.
346 representation and the index in ``s`` where the document ended.
329
347
330 This can be used to decode a JSON document from a string that may
348 This can be used to decode a JSON document from a string that may
331 have extraneous data at the end.
349 have extraneous data at the end.
332
350
333 """
351 """
334 kw.setdefault('context', self)
335 try:
352 try:
336 obj, end = self._scanner.iterscan(s, **kw).next()
353 obj, end = self.scan_once(s, idx)
337 except StopIteration:
354 except StopIteration:
338 raise ValueError("No JSON object could be decoded")
355 raise ValueError("No JSON object could be decoded")
339 return obj, end
356 return obj, end
OLD
NEW