[issue4136] merge json library with latest simplejson 2.0.x - Code Review (original) (raw)

OLD

NEW

1 """Implementation of JSONDecoder

1 """Implementation of JSONDecoder

2 """

2 """

3

4 import re

3 import re

5 import sys

4 import sys

5 import struct

6

6

7 from json.scanner import Scanner, pattern

7 from json.scanner import make_scanner

8 try:

8 try:

9 from _json import scanstring as c_scanstring

9 from _json import scanstring as c_scanstring

10 except ImportError:

10 except ImportError:

11 c_scanstring = None

11 c_scanstring = None

12

12

13 __all__ = ['JSONDecoder']

13 __all__ = ['JSONDecoder']

14

14

15 FLAGS = re.VERBOSE | re.MULTILINE | re.DOTALL

15 FLAGS = re.VERBOSE | re.MULTILINE | re.DOTALL

16

16

17 NaN, PosInf, NegInf = float('nan'), float('inf'), float('-inf')

17 def _floatconstants():

18 _BYTES = '7FF80000000000007FF0000000000000'.decode('hex')

19 if sys.byteorder != 'big':

20 _BYTES = _BYTES[:8][::-1] + _BYTES[8:][::-1]

21 nan, inf = struct.unpack('dd', _BYTES)

22 return nan, inf, -inf

23

24 NaN, PosInf, NegInf = _floatconstants()

18

25

19

26

20 def linecol(doc, pos):

27 def linecol(doc, pos):

21 lineno = doc.count('\n', 0, pos) + 1

28 lineno = doc.count('\n', 0, pos) + 1

22 if lineno == 1:

29 if lineno == 1:

23 colno = pos

30 colno = pos

24 else:

31 else:

25 colno = pos - doc.rindex('\n', 0, pos)

32 colno = pos - doc.rindex('\n', 0, pos)

26 return lineno, colno

33 return lineno, colno

27

34

28

35

29 def errmsg(msg, doc, pos, end=None):

36 def errmsg(msg, doc, pos, end=None):

37 # Note that this function is called from _json

30 lineno, colno = linecol(doc, pos)

38 lineno, colno = linecol(doc, pos)

31 if end is None:

39 if end is None:

32 fmt = '{0}: line {1} column {2} (char {3})'

40 fmt = '{0}: line {1} column {2} (char {3})'

33 return fmt.format(msg, lineno, colno, pos)

41 return fmt.format(msg, lineno, colno, pos)

42 #fmt = '%s: line %d column %d (char %d)'

43 #return fmt % (msg, lineno, colno, pos)

34 endlineno, endcolno = linecol(doc, end)

44 endlineno, endcolno = linecol(doc, end)

35 fmt = '{0}: line {1} column {2} - line {3} column {4} (char {5} - {6})'

45 fmt = '{0}: line {1} column {2} - line {3} column {4} (char {5} - {6})'

36 return fmt.format(msg, lineno, colno, endlineno, endcolno, pos, end)

46 return fmt.format(msg, lineno, colno, endlineno, endcolno, pos, end)

47 #fmt = '%s: line %d column %d - line %d column %d (char %d - %d)'

48 #return fmt % (msg, lineno, colno, endlineno, endcolno, pos, end)

37

49

38

50

39 _CONSTANTS = {

51 _CONSTANTS = {

40 '-Infinity': NegInf,

52 '-Infinity': NegInf,

41 'Infinity': PosInf,

53 'Infinity': PosInf,

42 'NaN': NaN,

54 'NaN': NaN,

43 'true': True,

44 'false': False,

45 'null': None,

46 }

55 }

47

56

48

49 def JSONConstant(match, context, c=_CONSTANTS):

50 s = match.group(0)

51 fn = getattr(context, 'parse_constant', None)

52 if fn is None:

53 rval = c[s]

54 else:

55 rval = fn(s)

56 return rval, None

57 pattern('(-?Infinity|NaN|true|false|null)')(JSONConstant)

58

59

60 def JSONNumber(match, context):

61 match = JSONNumber.regex.match(match.string, *match.span())

62 integer, frac, exp = match.groups()

63 if frac or exp:

64 fn = getattr(context, 'parse_float', None) or float

65 res = fn(integer + (frac or '') + (exp or ''))

66 else:

67 fn = getattr(context, 'parse_int', None) or int

68 res = fn(integer)

69 return res, None

70 pattern(r'(-?(?:0|[1-9]\d*))(\.\d+)?([eE][-+]?\d+)?')(JSONNumber)

71

72

73 STRINGCHUNK = re.compile(r'(.*?)(["\\\x00-\x1f])', FLAGS)

57 STRINGCHUNK = re.compile(r'(.*?)(["\\\x00-\x1f])', FLAGS)

74 BACKSLASH = {

58 BACKSLASH = {

75 '"': u'"', '\\': u'\\', '/': u'/',

59 '"': u'"', '\\': u'\\', '/': u'/',

76 'b': u'\b', 'f': u'\f', 'n': u'\n', 'r': u'\r', 't': u'\t',

60 'b': u'\b', 'f': u'\f', 'n': u'\n', 'r': u'\r', 't': u'\t',

77 }

61 }

78

62

79 DEFAULT_ENCODING = "utf-8"

63 DEFAULT_ENCODING = "utf-8"

80

64

81

65 def py_scanstring(s, end, encoding=None, strict=True,

82 def py_scanstring(s, end, encoding=None, strict=True, _b=BACKSLASH, _m=STRINGCHUNK.match):

66 _b=BACKSLASH, _m=STRINGCHUNK.match):

67 """Scan the string s for a JSON string. End is the index of the

68 character in s after the quote that started the JSON string.

69 Unescapes all valid JSON string escape sequences and raises ValueError

70 on attempt to decode an invalid string. If strict is False then literal

71 control characters are allowed in the string.

72 ····

73 Returns a tuple of the decoded string and the index of the character in s

74 after the end quote."""

83 if encoding is None:

75 if encoding is None:

84 encoding = DEFAULT_ENCODING

76 encoding = DEFAULT_ENCODING

85 chunks = []

77 chunks = []

86 _append = chunks.append

78 _append = chunks.append

87 begin = end - 1

79 begin = end - 1

88 while 1:

80 while 1:

89 chunk = _m(s, end)

81 chunk = _m(s, end)

90 if chunk is None:

82 if chunk is None:

91 raise ValueError(

83 raise ValueError(

92 errmsg("Unterminated string starting at", s, begin))

84 errmsg("Unterminated string starting at", s, begin))

93 end = chunk.end()

85 end = chunk.end()

94 content, terminator = chunk.groups()

86 content, terminator = chunk.groups()

87 # Content is contains zero or more unescaped string characters

95 if content:

88 if content:

96 if not isinstance(content, unicode):

89 if not isinstance(content, unicode):

97 content = unicode(content, encoding)

90 content = unicode(content, encoding)

98 _append(content)

91 _append(content)

92 # Terminator is the end of string, a literal control character,

93 # or a backslash denoting that an escape sequence follows

99 if terminator == '"':

94 if terminator == '"':

100 break

95 break

101 elif terminator != '\\':

96 elif terminator != '\\':

102 if strict:

97 if strict:

98 #msg = "Invalid control character %r at" % (terminator,)

103 msg = "Invalid control character {0!r} at".format(terminator)

99 msg = "Invalid control character {0!r} at".format(terminator)

104 raise ValueError(errmsg(msg, s, end))

100 raise ValueError(errmsg(msg, s, end))

105 else:

101 else:

106 _append(terminator)

102 _append(terminator)

107 continue

103 continue

108 try:

104 try:

109 esc = s[end]

105 esc = s[end]

110 except IndexError:

106 except IndexError:

111 raise ValueError(

107 raise ValueError(

112 errmsg("Unterminated string starting at", s, begin))

108 errmsg("Unterminated string starting at", s, begin))

109 # If not a unicode escape sequence, must be in the lookup table

113 if esc != 'u':

110 if esc != 'u':

114 try:

111 try:

115 m = _b[esc]

112 char = _b[esc]

116 except KeyError:

113 except KeyError:

117 msg = "Invalid \\escape: {0!r}".format(esc)

114 msg = "Invalid \\escape: " + repr(esc)

118 raise ValueError(errmsg(msg, s, end))

115 raise ValueError(errmsg(msg, s, end))

119 end += 1

116 end += 1

120 else:

117 else:

118 # Unicode escape sequence

121 esc = s[end + 1:end + 5]

119 esc = s[end + 1:end + 5]

122 next_end = end + 5

120 next_end = end + 5

123 msg = "Invalid \\uXXXX escape"

121 if len(esc) != 4:

124 try:

122 msg = "Invalid \\uXXXX escape"

125 if len(esc) != 4:

126 raise ValueError

127 uni = int(esc, 16)

128 if 0xd800 <= uni <= 0xdbff and sys.maxunicode > 65535:

129 msg = "Invalid \\uXXXX\\uXXXX surrogate pair"

130 if not s[end + 5:end + 7] == '\\u':

131 raise ValueError

132 esc2 = s[end + 7:end + 11]

133 if len(esc2) != 4:

134 raise ValueError

135 uni2 = int(esc2, 16)

136 uni = 0x10000 + (((uni - 0xd800) << 10) | (uni2 - 0xdc00))

137 next_end += 6

138 m = unichr(uni)

139 except ValueError:

140 raise ValueError(errmsg(msg, s, end))

123 raise ValueError(errmsg(msg, s, end))

124 uni = int(esc, 16)

125 # Check for surrogate pair on UCS-4 systems

126 if 0xd800 <= uni <= 0xdbff and sys.maxunicode > 65535:

127 msg = "Invalid \\uXXXX\\uXXXX surrogate pair"

128 if not s[end + 5:end + 7] == '\\u':

129 raise ValueError(errmsg(msg, s, end))

130 esc2 = s[end + 7:end + 11]

131 if len(esc2) != 4:

132 raise ValueError(errmsg(msg, s, end))

133 uni2 = int(esc2, 16)

134 uni = 0x10000 + (((uni - 0xd800) << 10) | (uni2 - 0xdc00))

135 next_end += 6

136 char = unichr(uni)

141 end = next_end

137 end = next_end

142 _append(m)

138 # Append the unescaped character

139 _append(char)

143 return u''.join(chunks), end

140 return u''.join(chunks), end

144

141

145

142

146 # Use speedup

143 # Use speedup if available

147 if c_scanstring is not None:

144 scanstring = c_scanstring or py_scanstring

148 scanstring = c_scanstring

149 else:

150 scanstring = py_scanstring

151

145

152 def JSONString(match, context):

146 WHITESPACE = re.compile(r'[ \t\n\r]*', FLAGS)

153 encoding = getattr(context, 'encoding', None)

147 WHITESPACE_STR = ' \t\n\r'

154 strict = getattr(context, 'strict', True)

155 return scanstring(match.string, match.end(), encoding, strict)

156 pattern(r'"')(JSONString)

157

148

158

149 def JSONObject((s, end), encoding, strict, scan_once, object_hook,

159 WHITESPACE = re.compile(r'\s*', FLAGS)

150 _w=WHITESPACE.match, _ws=WHITESPACE_STR):

160

161

162 def JSONObject(match, context, _w=WHITESPACE.match):

163 pairs = {}

151 pairs = {}

164 s = match.string

152 # Use a slice to prevent IndexError from being raised, the following

165 end = _w(s, match.end()).end()

153 # check will raise a more specific ValueError if the string is empty

166 nextchar = s[end:end + 1]

154 nextchar = s[end:end + 1]

167 # Trivial empty object

155 # Normally we expect nextchar == '"'

168 if nextchar == '}':

169 return pairs, end + 1

170 if nextchar != '"':

156 if nextchar != '"':

171 raise ValueError(errmsg("Expecting property name", s, end))

157 if nextchar in _ws:

158 end = _w(s, end).end()

159 nextchar = s[end:end + 1]

160 # Trivial empty object

161 if nextchar == '}':

162 return pairs, end + 1

163 elif nextchar != '"':

164 raise ValueError(errmsg("Expecting property name", s, end))

172 end += 1

165 end += 1

173 encoding = getattr(context, 'encoding', None)

174 strict = getattr(context, 'strict', True)

175 iterscan = JSONScanner.iterscan

176 while True:

166 while True:

177 key, end = scanstring(s, end, encoding, strict)

167 key, end = scanstring(s, end, encoding, strict)

178 end = _w(s, end).end()

168

169 # To skip some function call overhead we optimize the fast paths where

170 # the JSON key separator is ": " or just ":".

179 if s[end:end + 1] != ':':

171 if s[end:end + 1] != ':':

180 raise ValueError(errmsg("Expecting : delimiter", s, end))

172 end = _w(s, end).end()

181 end = _w(s, end + 1).end()

173 if s[end:end + 1] != ':':

174 raise ValueError(errmsg("Expecting : delimiter", s, end))

175

176 end += 1

177

182 try:

178 try:

183 value, end = iterscan(s, idx=end, context=context).next()

179 if s[end] in _ws:

180 end += 1

181 if s[end] in _ws:

182 end = _w(s, end + 1).end()

183 except IndexError:

184 pass

185

186 try:

187 value, end = scan_once(s, end)

184 except StopIteration:

188 except StopIteration:

185 raise ValueError(errmsg("Expecting object", s, end))

189 raise ValueError(errmsg("Expecting object", s, end))

186 pairs[key] = value

190 pairs[key] = value

187 end = _w(s, end).end()

191

188 nextchar = s[end:end + 1]

192 try:

193 nextchar = s[end]

194 if nextchar in _ws:

195 end = _w(s, end + 1).end()

196 nextchar = s[end]

197 except IndexError:

198 nextchar = ''

189 end += 1

199 end += 1

200

190 if nextchar == '}':

201 if nextchar == '}':

191 break

202 break

192 if nextchar != ',':

203 elif nextchar != ',':

193 raise ValueError(errmsg("Expecting , delimiter", s, end - 1))

204 raise ValueError(errmsg("Expecting , delimiter", s, end - 1))

194 end = _w(s, end).end()

205

195 nextchar = s[end:end + 1]

206 try:

207 nextchar = s[end]

208 if nextchar in _ws:

209 end += 1

210 nextchar = s[end]

211 if nextchar in _ws:

212 end = _w(s, end + 1).end()

213 nextchar = s[end]

214 except IndexError:

215 nextchar = ''

216

196 end += 1

217 end += 1

197 if nextchar != '"':

218 if nextchar != '"':

198 raise ValueError(errmsg("Expecting property name", s, end - 1))

219 raise ValueError(errmsg("Expecting property name", s, end - 1))

199 object_hook = getattr(context, 'object_hook', None)

220

200 if object_hook is not None:

221 if object_hook is not None:

201 pairs = object_hook(pairs)

222 pairs = object_hook(pairs)

202 return pairs, end

223 return pairs, end

203 pattern(r'{')(JSONObject)

204

224

205

225 def JSONArray((s, end), scan_once, _w=WHITESPACE.match, _ws=WHITESPACE_STR):

206 def JSONArray(match, context, _w=WHITESPACE.match):

207 values = []

226 values = []

208 s = match.string

227 nextchar = s[end:end + 1]

209 end = _w(s, match.end()).end()

228 if nextchar in _ws:

229 end = _w(s, end + 1).end()

230 nextchar = s[end:end + 1]

210 # Look-ahead for trivial empty array

231 # Look-ahead for trivial empty array

211 nextchar = s[end:end + 1]

212 if nextchar == ']':

232 if nextchar == ']':

213 return values, end + 1

233 return values, end + 1

214 iterscan = JSONScanner.iterscan

234 _append = values.append

215 while True:

235 while True:

216 try:

236 try:

217 value, end = iterscan(s, idx=end, context=context).next()

237 value, end = scan_once(s, end)

218 except StopIteration:

238 except StopIteration:

219 raise ValueError(errmsg("Expecting object", s, end))

239 raise ValueError(errmsg("Expecting object", s, end))

220 values.append(value)

240 _append(value)

221 end = _w(s, end).end()

222 nextchar = s[end:end + 1]

241 nextchar = s[end:end + 1]

242 if nextchar in _ws:

243 end = _w(s, end + 1).end()

244 nextchar = s[end:end + 1]

223 end += 1

245 end += 1

224 if nextchar == ']':

246 if nextchar == ']':

225 break

247 break

226 if nextchar != ',':

248 elif nextchar != ',':

227 raise ValueError(errmsg("Expecting , delimiter", s, end))

249 raise ValueError(errmsg("Expecting , delimiter", s, end))

228 end = _w(s, end).end()

250

251 try:

252 if s[end] in _ws:

253 end += 1

254 if s[end] in _ws:

255 end = _w(s, end + 1).end()

256 except IndexError:

257 pass

258

229 return values, end

259 return values, end

230 pattern(r'\[')(JSONArray)

231

232

233 ANYTHING = [

234 JSONObject,

235 JSONArray,

236 JSONString,

237 JSONConstant,

238 JSONNumber,

239 ]

240

241 JSONScanner = Scanner(ANYTHING)

242

243

260

244 class JSONDecoder(object):

261 class JSONDecoder(object):

245 """Simple JSON http://json.org decoder

262 """Simple JSON http://json.org decoder

246

263

247 Performs the following translations in decoding by default:

264 Performs the following translations in decoding by default:

248

265

249 +---------------+-------------------+

266 +---------------+-------------------+

250 | JSON | Python |

267 | JSON | Python |

251 +===============+===================+

268 +===============+===================+

252 | object | dict |

269 | object | dict |

253 +---------------+-------------------+

270 +---------------+-------------------+

254 | array | list |

271 | array | list |

255 +---------------+-------------------+

272 +---------------+-------------------+

256 | string | unicode |

273 | string | unicode |

257 +---------------+-------------------+

274 +---------------+-------------------+

258 | number (int) | int, long |

275 | number (int) | int, long |

259 +---------------+-------------------+

276 +---------------+-------------------+

260 | number (real) | float |

277 | number (real) | float |

261 +---------------+-------------------+

278 +---------------+-------------------+

262 | true | True |

279 | true | True |

263 +---------------+-------------------+

280 +---------------+-------------------+

264 | false | False |

281 | false | False |

265 +---------------+-------------------+

282 +---------------+-------------------+

266 | null | None |

283 | null | None |

267 +---------------+-------------------+

284 +---------------+-------------------+

268

285

269 It also understands ``NaN``, ``Infinity``, and ``-Infinity`` as

286 It also understands ``NaN``, ``Infinity``, and ``-Infinity`` as

270 their corresponding ``float`` values, which is outside the JSON spec.

287 their corresponding ``float`` values, which is outside the JSON spec.

288

271 """

289 """

272

290

273 _scanner = Scanner(ANYTHING)

274 __all__ = ['__init__', 'decode', 'raw_decode']

275

276 def __init__(self, encoding=None, object_hook=None, parse_float=None,

291 def __init__(self, encoding=None, object_hook=None, parse_float=None,

277 parse_int=None, parse_constant=None, strict=True):

292 parse_int=None, parse_constant=None, strict=True):

278 """``encoding`` determines the encoding used to interpret any ``str``

293 """``encoding`` determines the encoding used to interpret any ``str``

279 objects decoded by this instance (utf-8 by default). It has no

294 objects decoded by this instance (utf-8 by default). It has no

280 effect when decoding ``unicode`` objects.

295 effect when decoding ``unicode`` objects.

281

296

282 Note that currently only encodings that are a superset of ASCII work,

297 Note that currently only encodings that are a superset of ASCII work,

283 strings of other encodings should be passed in as ``unicode``.

298 strings of other encodings should be passed in as ``unicode``.

284

299

285 ``object_hook``, if specified, will be called with the result of

300 ``object_hook``, if specified, will be called with the result

286 every JSON object decoded and its return value will be used in

301 of every JSON object decoded and its return value will be used in

287 place of the given ``dict``. This can be used to provide custom

302 place of the given ``dict``. This can be used to provide custom

288 deserializations (e.g. to support JSON-RPC class hinting).

303 deserializations (e.g. to support JSON-RPC class hinting).

289

304

290 ``parse_float``, if specified, will be called with the string

305 ``parse_float``, if specified, will be called with the string

291 of every JSON float to be decoded. By default this is equivalent to

306 of every JSON float to be decoded. By default this is equivalent to

292 float(num_str). This can be used to use another datatype or parser

307 float(num_str). This can be used to use another datatype or parser

293 for JSON floats (e.g. decimal.Decimal).

308 for JSON floats (e.g. decimal.Decimal).

294

309

295 ``parse_int``, if specified, will be called with the string

310 ``parse_int``, if specified, will be called with the string

296 of every JSON int to be decoded. By default this is equivalent to

311 of every JSON int to be decoded. By default this is equivalent to

297 int(num_str). This can be used to use another datatype or parser

312 int(num_str). This can be used to use another datatype or parser

298 for JSON integers (e.g. float).

313 for JSON integers (e.g. float).

299

314

300 ``parse_constant``, if specified, will be called with one of the

315 ``parse_constant``, if specified, will be called with one of the

301 following strings: -Infinity, Infinity, NaN, null, true, false.

316 following strings: -Infinity, Infinity, NaN.

302 This can be used to raise an exception if invalid JSON numbers

317 This can be used to raise an exception if invalid JSON numbers

303 are encountered.

318 are encountered.

304

319

305 """

320 """

306 self.encoding = encoding

321 self.encoding = encoding

307 self.object_hook = object_hook

322 self.object_hook = object_hook

308 self.parse_float = parse_float

323 self.parse_float = parse_float or float

309 self.parse_int = parse_int

324 self.parse_int = parse_int or int

310 self.parse_constant = parse_constant

325 self.parse_constant = parse_constant or _CONSTANTS.__getitem__

311 self.strict = strict

326 self.strict = strict

327 self.parse_object = JSONObject

328 self.parse_array = JSONArray

329 self.parse_string = scanstring

330 self.scan_once = make_scanner(self)

312

331

313 def decode(self, s, _w=WHITESPACE.match):

332 def decode(self, s, _w=WHITESPACE.match):

314 """

333 """Return the Python representation of ``s`` (a ``str`` or ``unicode``

315 Return the Python representation of ``s`` (a ``str`` or ``unicode``

316 instance containing a JSON document)

334 instance containing a JSON document)

317

335

318 """

336 """

319 obj, end = self.raw_decode(s, idx=_w(s, 0).end())

337 obj, end = self.raw_decode(s, idx=_w(s, 0).end())

320 end = _w(s, end).end()

338 end = _w(s, end).end()

321 if end != len(s):

339 if end != len(s):

322 raise ValueError(errmsg("Extra data", s, end, len(s)))

340 raise ValueError(errmsg("Extra data", s, end, len(s)))

323 return obj

341 return obj

324

342

325 def raw_decode(self, s, **kw):

343 def raw_decode(self, s, idx=0):

326 """Decode a JSON document from ``s`` (a ``str`` or ``unicode`` beginning

344 """Decode a JSON document from ``s`` (a ``str`` or ``unicode``

327 with a JSON document) and return a 2-tuple of the Python

345 beginning with a JSON document) and return a 2-tuple of the Python

328 representation and the index in ``s`` where the document ended.

346 representation and the index in ``s`` where the document ended.

329

347

330 This can be used to decode a JSON document from a string that may

348 This can be used to decode a JSON document from a string that may

331 have extraneous data at the end.

349 have extraneous data at the end.

332

350

333 """

351 """

334 kw.setdefault('context', self)

335 try:

352 try:

336 obj, end = self._scanner.iterscan(s, **kw).next()

353 obj, end = self.scan_once(s, idx)

337 except StopIteration:

354 except StopIteration:

338 raise ValueError("No JSON object could be decoded")

355 raise ValueError("No JSON object could be decoded")

339 return obj, end

356 return obj, end

OLD

NEW