cpython: c85305a54e6d (original) (raw)
Mercurial > cpython
changeset 87587:c85305a54e6d 2.7
Issue #11489: JSON decoder now accepts lone surrogates. [#11489]
| field | value |
|---|---|
| author | Serhiy Storchaka storchaka@gmail.com |
| date | Tue, 26 Nov 2013 21:25:15 +0200 |
| parents | 461a4d10753a |
| children | 72951ffbdc76 c8e138646be1 |
| files | Lib/json/decoder.py, Lib/json/tests/test_scanstring.py, Misc/NEWS, Modules/_json.c |
| diffstat | 4 files changed, 84 insertions(+), 57 deletions(-): Lib/json/decoder.py 35, Lib/json/tests/test_scanstring.py 55, Misc/NEWS 2, Modules/_json.c 49 |
line wrap: on
line diff
```diff
--- a/Lib/json/decoder.py
+++ b/Lib/json/decoder.py
@@ -62,6 +62,16 @@ BACKSLASH = {
 
 DEFAULT_ENCODING = "utf-8"
 
+def _decode_uXXXX(s, pos):
+    esc = s[pos + 1:pos + 5]
+    if len(esc) == 4 and esc[1] not in 'xX':
+        try:
+            return int(esc, 16)
+        except ValueError:
+            pass
+    msg = "Invalid \\uXXXX escape"
+    raise ValueError(errmsg(msg, s, pos))
```
```diff
+
 def py_scanstring(s, end, encoding=None, strict=True,
         _b=BACKSLASH, _m=STRINGCHUNK.match):
     """Scan the string s for a JSON string. End is the index of the
@@ -116,25 +126,16 @@ def py_scanstring(s, end, encoding=None,
             end += 1
         else:
             # Unicode escape sequence
-            esc = s[end + 1:end + 5]
-            next_end = end + 5
-            if len(esc) != 4:
-                msg = "Invalid \\uXXXX escape"
-                raise ValueError(errmsg(msg, s, end))
-            uni = int(esc, 16)
+            uni = _decode_uXXXX(s, end)
+            end += 5
             # Check for surrogate pair on UCS-4 systems
-            if 0xd800 <= uni <= 0xdbff and sys.maxunicode > 65535:
-                msg = "Invalid \\uXXXX\\uXXXX surrogate pair"
-                if not s[end + 5:end + 7] == '\\u':
-                    raise ValueError(errmsg(msg, s, end))
-                esc2 = s[end + 7:end + 11]
-                if len(esc2) != 4:
-                    raise ValueError(errmsg(msg, s, end))
-                uni2 = int(esc2, 16)
-                uni = 0x10000 + (((uni - 0xd800) << 10) | (uni2 - 0xdc00))
-                next_end += 6
+            if sys.maxunicode > 65535 and \
+               0xd800 <= uni <= 0xdbff and s[end:end + 2] == '\\u':
+                uni2 = _decode_uXXXX(s, end + 1)
+                if 0xdc00 <= uni2 <= 0xdfff:
+                    uni = 0x10000 + (((uni - 0xd800) << 10) | (uni2 - 0xdc00))
+                    end += 6
             char = unichr(uni)
-            end = next_end
         # Append the unescaped character
         _append(char)
     return u''.join(chunks), end
```
```diff
--- a/Lib/json/tests/test_scanstring.py
+++ b/Lib/json/tests/test_scanstring.py
@@ -5,10 +5,6 @@ from json.tests import PyTest, CTest
 class TestScanstring(object):
     def test_scanstring(self):
         scanstring = self.json.decoder.scanstring
-        self.assertEqual(
-            scanstring('"z\\ud834\\udd20x"', 1, None, True),
-            (u'z\U0001d120x', 16))
-
         if sys.maxunicode == 65535:
             self.assertEqual(
                 scanstring(u'"z\U0001d120x"', 1, None, True),
@@ -94,6 +90,57 @@ class TestScanstring(object):
             scanstring('["Bad value", truth]', 2, None, True),
             (u'Bad value', 12))
 
```
```diff
+    def test_surrogates(self):
+        scanstring = self.json.decoder.scanstring
+        def assertScan(given, expect):
+            self.assertEqual(scanstring(given, 1, None, True),
+                             (expect, len(given)))
+            if not isinstance(given, unicode):
+                given = unicode(given)
+                self.assertEqual(scanstring(given, 1, None, True),
+                                 (expect, len(given)))
+
+        assertScan('"z\\ud834\\u0079x"', u'z\ud834yx')
+        assertScan('"z\\ud834\\udd20x"', u'z\U0001d120x')
+        assertScan('"z\\ud834\\ud834\\udd20x"', u'z\ud834\U0001d120x')
+        assertScan('"z\\ud834x"', u'z\ud834x')
+        assertScan(u'"z\\ud834\udd20x12345"', u'z\ud834\udd20x12345')
+        assertScan('"z\\udd20x"', u'z\udd20x')
+        assertScan(u'"z\ud834\udd20x"', u'z\ud834\udd20x')
+        assertScan(u'"z\ud834\\udd20x"', u'z\ud834\udd20x')
+        assertScan(u'"z\ud834x"', u'z\ud834x')
+
+    def test_bad_escapes(self):
+        scanstring = self.json.decoder.scanstring
+        bad_escapes = [
+            '"\\"',
+            '"\\x"',
+            '"\\u"',
+            '"\\u0"',
+            '"\\u01"',
+            '"\\u012"',
+            '"\\uz012"',
+            '"\\u0z12"',
+            '"\\u01z2"',
+            '"\\u012z"',
+            '"\\u0x12"',
+            '"\\u0X12"',
+            '"\\ud834\\"',
+            '"\\ud834\\u"',
+            '"\\ud834\\ud"',
+            '"\\ud834\\udd"',
+            '"\\ud834\\udd2"',
+            '"\\ud834\\uzdd2"',
+            '"\\ud834\\udzd2"',
+            '"\\ud834\\uddz2"',
+            '"\\ud834\\udd2z"',
+            '"\\ud834\\u0x20"',
+            '"\\ud834\\u0X20"',
+        ]
+        for s in bad_escapes:
+            with self.assertRaises(ValueError):
+                scanstring(s, 1, None, True)
+
     def test_issue3623(self):
         self.assertRaises(ValueError, self.json.decoder.scanstring, b"xxx", 1,
                           "xxx")
```
```diff
--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -15,6 +15,8 @@ Core and Builtins
 Library
 -------
 
+- Issue #11489: JSON decoder now accepts lone surrogates.
+
 - Fix test.test_support.bind_port() to not cause an error when Python was
   compiled on a system with SO_REUSEPORT defined in the headers but run on
   a system with an OS kernel that does not support that new socket option.
```
```diff
--- a/Modules/_json.c
+++ b/Modules/_json.c
@@ -524,16 +524,10 @@ scanstring_str(PyObject *pystr, Py_ssize
             }
 #ifdef Py_UNICODE_WIDE
             /* Surrogate pair */
-            if ((c & 0xfc00) == 0xd800) {
+            if ((c & 0xfc00) == 0xd800 && end + 6 < len &&
+                buf[next++] == '\\' &&
+                buf[next++] == 'u') {
                 Py_UNICODE c2 = 0;
-                if (end + 6 >= len) {
-                    raise_errmsg("Unpaired high surrogate", pystr, end - 5);
-                    goto bail;
-                }
-                if (buf[next++] != '\\' || buf[next++] != 'u') {
-                    raise_errmsg("Unpaired high surrogate", pystr, end - 5);
-                    goto bail;
-                }
                 end += 6;
                 /* Decode 4 hex digits */
                 for (; next < end; next++) {
@@ -554,15 +548,10 @@ scanstring_str(PyObject *pystr, Py_ssize
                         goto bail;
                     }
                 }
-                if ((c2 & 0xfc00) != 0xdc00) {
-                    raise_errmsg("Unpaired high surrogate", pystr, end - 5);
-                    goto bail;
-                }
-                c = 0x10000 + (((c - 0xd800) << 10) | (c2 - 0xdc00));
-            }
-            else if ((c & 0xfc00) == 0xdc00) {
-                raise_errmsg("Unpaired low surrogate", pystr, end - 5);
-                goto bail;
+                if ((c2 & 0xfc00) == 0xdc00)
+                    c = 0x10000 + (((c - 0xd800) << 10) | (c2 - 0xdc00));
+                else
+                    end -= 6;
             }
```
```diff
 #endif
         }
@@ -703,16 +692,9 @@ scanstring_unicode(PyObject *pystr, Py_s
             }
 #ifdef Py_UNICODE_WIDE
             /* Surrogate pair */
-            if ((c & 0xfc00) == 0xd800) {
+            if ((c & 0xfc00) == 0xd800 && end + 6 < len &&
+                buf[next++] == '\\' && buf[next++] == 'u') {
                 Py_UNICODE c2 = 0;
-                if (end + 6 >= len) {
-                    raise_errmsg("Unpaired high surrogate", pystr, end - 5);
-                    goto bail;
-                }
-                if (buf[next++] != '\\' || buf[next++] != 'u') {
-                    raise_errmsg("Unpaired high surrogate", pystr, end - 5);
-                    goto bail;
-                }
                 end += 6;
                 /* Decode 4 hex digits */
                 for (; next < end; next++) {
@@ -733,15 +715,10 @@ scanstring_unicode(PyObject *pystr, Py_s
                         goto bail;
                     }
                 }
-                if ((c2 & 0xfc00) != 0xdc00) {
-                    raise_errmsg("Unpaired high surrogate", pystr, end - 5);
-                    goto bail;
-                }
-                c = 0x10000 + (((c - 0xd800) << 10) | (c2 - 0xdc00));
-            }
-            else if ((c & 0xfc00) == 0xdc00) {
-                raise_errmsg("Unpaired low surrogate", pystr, end - 5);
-                goto bail;
+                if ((c2 & 0xfc00) == 0xdc00)
+                    c = 0x10000 + (((c - 0xd800) << 10) | (c2 - 0xdc00));
+                else
+                    end -= 6;
             }
```