cpython: 218da678bb8b (original) (raw)
Mercurial > cpython
changeset 81806:218da678bb8b
Issue #16979: Fix error handling bugs in the unicode-escape-decode decoder. [#16979]
author | Serhiy Storchaka <storchaka@gmail.com> |
---|---|
date | Tue, 29 Jan 2013 10:37:39 +0200 |
parents | 8b9910d8d27f (current diff), 086defaf16fe (diff) |
children | d2f502cf12f4 |
files | Lib/test/test_codeccallbacks.py Lib/test/test_codecs.py Misc/NEWS Objects/unicodeobject.c |
diffstat | 4 files changed, 117 insertions(+), 53 deletions(-): Lib/test/test_codeccallbacks.py 4, Lib/test/test_codecs.py 84, Misc/NEWS 2, Objects/unicodeobject.c 80 |
line wrap: on
line diff
--- a/Lib/test/test_codeccallbacks.py +++ b/Lib/test/test_codeccallbacks.py @@ -271,12 +271,12 @@ class CodecCallbackTest(unittest.TestCas self.assertEqual( b"\u3042\u3xxx".decode("unicode-escape", "test.handler1"),
"\u3042[<92><117><51><120>]xx"[](#l1.7)
"\u3042[<92><117><51>]xxx"[](#l1.8) )[](#l1.9)
self.assertEqual( b"\u3042\u3xx".decode("unicode-escape", "test.handler1"),
"\u3042[<92><117><51><120><120>]"[](#l1.13)
"\u3042[<92><117><51>]xx"[](#l1.14) )[](#l1.15)
--- a/Lib/test/test_codecs.py +++ b/Lib/test/test_codecs.py @@ -21,6 +21,11 @@ except ImportError: else: SIZEOF_WCHAR_T = ctypes.sizeof(ctypes.c_wchar) +def coding_checker(self, coder):
- def check(input, expect):
self.assertEqual(coder(input), (expect, len(input)))[](#l2.9)
- return check
+ class Queue(object): """ queue: write bytes at one end, read bytes from the other end @@ -2009,6 +2014,85 @@ class TypesTest(unittest.TestCase): self.assertRaises(UnicodeDecodeError, codecs.raw_unicode_escape_decode, br"\U00110000") self.assertEqual(codecs.raw_unicode_escape_decode(r"\U00110000", "replace"), ("\ufffd", 10)) + +class UnicodeEscapeTest(unittest.TestCase):
- def test_empty(self):
self.assertEqual(codecs.unicode_escape_encode(""), (b"", 0))[](#l2.22)
self.assertEqual(codecs.unicode_escape_decode(b""), ("", 0))[](#l2.23)
- def test_raw_encode(self):
encode = codecs.unicode_escape_encode[](#l2.26)
for b in range(32, 127):[](#l2.27)
if b != b'\\'[0]:[](#l2.28)
self.assertEqual(encode(chr(b)), (bytes([b]), 1))[](#l2.29)
- def test_raw_decode(self):
decode = codecs.unicode_escape_decode[](#l2.32)
for b in range(256):[](#l2.33)
if b != b'\\'[0]:[](#l2.34)
self.assertEqual(decode(bytes([b]) + b'0'), (chr(b) + '0', 2))[](#l2.35)
- def test_escape_encode(self):
encode = codecs.unicode_escape_encode[](#l2.38)
check = coding_checker(self, encode)[](#l2.39)
check('\t', br'\t')[](#l2.40)
check('\n', br'\n')[](#l2.41)
check('\r', br'\r')[](#l2.42)
check('\\', br'\\')[](#l2.43)
for b in range(32):[](#l2.44)
if chr(b) not in '\t\n\r':[](#l2.45)
check(chr(b), ('\\x%02x' % b).encode())[](#l2.46)
for b in range(127, 256):[](#l2.47)
check(chr(b), ('\\x%02x' % b).encode())[](#l2.48)
check('\u20ac', br'\u20ac')[](#l2.49)
check('\U0001d120', br'\U0001d120')[](#l2.50)
- def test_escape_decode(self):
decode = codecs.unicode_escape_decode[](#l2.53)
check = coding_checker(self, decode)[](#l2.54)
check(b"[\\\n]", "[]")[](#l2.55)
check(br'[\"]', '["]')[](#l2.56)
check(br"[\']", "[']")[](#l2.57)
check(br"[\\]", r"[\]")[](#l2.58)
check(br"[\a]", "[\x07]")[](#l2.59)
check(br"[\b]", "[\x08]")[](#l2.60)
check(br"[\t]", "[\x09]")[](#l2.61)
check(br"[\n]", "[\x0a]")[](#l2.62)
check(br"[\v]", "[\x0b]")[](#l2.63)
check(br"[\f]", "[\x0c]")[](#l2.64)
check(br"[\r]", "[\x0d]")[](#l2.65)
check(br"[\7]", "[\x07]")[](#l2.66)
check(br"[\8]", r"[\8]")[](#l2.67)
check(br"[\78]", "[\x078]")[](#l2.68)
check(br"[\41]", "[!]")[](#l2.69)
check(br"[\418]", "[!8]")[](#l2.70)
check(br"[\101]", "[A]")[](#l2.71)
check(br"[\1010]", "[A0]")[](#l2.72)
check(br"[\x41]", "[A]")[](#l2.73)
check(br"[\x410]", "[A0]")[](#l2.74)
check(br"\u20ac", "\u20ac")[](#l2.75)
check(br"\U0001d120", "\U0001d120")[](#l2.76)
for b in range(256):[](#l2.77)
if b not in b'\n"\'\\abtnvfr01234567xuUN':[](#l2.78)
check(b'\\' + bytes([b]), '\\' + chr(b))[](#l2.79)
- def test_decode_errors(self):
decode = codecs.unicode_escape_decode[](#l2.82)
for c, d in (b'x', 2), (b'u', 4), (b'U', 4):[](#l2.83)
for i in range(d):[](#l2.84)
self.assertRaises(UnicodeDecodeError, decode,[](#l2.85)
b"\\" + c + b"0"*i)[](#l2.86)
self.assertRaises(UnicodeDecodeError, decode,[](#l2.87)
b"[\\" + c + b"0"*i + b"]")[](#l2.88)
data = b"[\\" + c + b"0"*i + b"]\\" + c + b"0"*i[](#l2.89)
self.assertEqual(decode(data, "ignore"), ("[]", len(data)))[](#l2.90)
self.assertEqual(decode(data, "replace"),[](#l2.91)
("[\ufffd]\ufffd", len(data)))[](#l2.92)
self.assertRaises(UnicodeDecodeError, decode, br"\U00110000")[](#l2.93)
self.assertEqual(decode(br"\U00110000", "ignore"), ("", 10))[](#l2.94)
self.assertEqual(decode(br"\U00110000", "replace"), ("\ufffd", 10))[](#l2.95)
+ + class SurrogateEscapeTest(unittest.TestCase): def test_utf8(self):
--- a/Misc/NEWS +++ b/Misc/NEWS @@ -234,6 +234,8 @@ Core and Builtins Library ------- +- Issue #16979: Fix error handling bugs in the unicode-escape-decode decoder. + Have py_compile use importlib as much as possible to avoid code duplication.
--- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -5378,7 +5378,6 @@ PyUnicode_DecodeUnicodeEscape(const char const char *starts = s; Py_ssize_t startinpos; Py_ssize_t endinpos;
- int j; _PyUnicodeWriter writer; const char end; char message; @@ -5500,28 +5499,19 @@ PyUnicode_DecodeUnicodeEscape(const char message = "truncated \UXXXXXXXX escape"; hexescape: chr = 0;
if (s+digits>end) {[](#l4.15)
endinpos = size;[](#l4.16)
if (unicode_decode_call_errorhandler_writer([](#l4.17)
errors, &errorHandler,[](#l4.18)
"unicodeescape", "end of string in escape sequence",[](#l4.19)
&starts, &end, &startinpos, &endinpos, &exc, &s,[](#l4.20)
&writer))[](#l4.21)
goto onError;[](#l4.22)
goto nextByte;[](#l4.23)
}[](#l4.24)
for (j = 0; j < digits; ++j) {[](#l4.25)
c = (unsigned char) s[j];[](#l4.26)
if (!Py_ISXDIGIT(c)) {[](#l4.27)
endinpos = (s+j+1)-starts;[](#l4.28)
if (unicode_decode_call_errorhandler_writer([](#l4.29)
errors, &errorHandler,[](#l4.30)
"unicodeescape", message,[](#l4.31)
&starts, &end, &startinpos, &endinpos, &exc, &s,[](#l4.32)
&writer))[](#l4.33)
goto onError;[](#l4.34)
goto nextByte;[](#l4.35)
if (end - s < digits) {[](#l4.36)
/* count only hex digits */[](#l4.37)
for (; s < end; ++s) {[](#l4.38)
c = (unsigned char)*s;[](#l4.39)
if (!Py_ISXDIGIT(c))[](#l4.40)
goto error;[](#l4.41) }[](#l4.42)
goto error;[](#l4.43)
}[](#l4.44)
for (; digits--; ++s) {[](#l4.45)
c = (unsigned char)*s;[](#l4.46)
if (!Py_ISXDIGIT(c))[](#l4.47)
goto error;[](#l4.48) chr = (chr<<4) & ~0xF;[](#l4.49) if (c >= '0' && c <= '9')[](#l4.50) chr += c - '0';[](#l4.51)
@@ -5530,24 +5520,16 @@ PyUnicode_DecodeUnicodeEscape(const char else chr += 10 + c - 'A'; }
s += j;[](#l4.56) if (chr == 0xffffffff && PyErr_Occurred())[](#l4.57) /* _decoding_error will have already written into the[](#l4.58) target buffer. */[](#l4.59) break;[](#l4.60) store:[](#l4.61) /* when we get here, chr is a 32-bit unicode character */[](#l4.62)
if (chr <= MAX_UNICODE) {[](#l4.63)
WRITECHAR(chr);[](#l4.64)
} else {[](#l4.65)
endinpos = s-starts;[](#l4.66)
if (unicode_decode_call_errorhandler_writer([](#l4.67)
errors, &errorHandler,[](#l4.68)
"unicodeescape", "illegal Unicode character",[](#l4.69)
&starts, &end, &startinpos, &endinpos, &exc, &s,[](#l4.70)
&writer))[](#l4.71)
goto onError;[](#l4.72)
}[](#l4.73)
message = "illegal Unicode character";[](#l4.74)
if (chr > MAX_UNICODE)[](#l4.75)
goto error;[](#l4.76)
WRITECHAR(chr);[](#l4.77) break;[](#l4.78)
/* \N{name} */ @@ -5575,26 +5557,13 @@ PyUnicode_DecodeUnicodeEscape(const char goto store; } }
endinpos = s-starts;[](#l4.85)
if (unicode_decode_call_errorhandler_writer([](#l4.86)
errors, &errorHandler,[](#l4.87)
"unicodeescape", message,[](#l4.88)
&starts, &end, &startinpos, &endinpos, &exc, &s,[](#l4.89)
&writer))[](#l4.90)
goto onError;[](#l4.91)
break;[](#l4.92)
goto error;[](#l4.93)
default: if (s > end) { message = "\ at end of string"; s--;
endinpos = s-starts;[](#l4.99)
if (unicode_decode_call_errorhandler_writer([](#l4.100)
errors, &errorHandler,[](#l4.101)
"unicodeescape", message,[](#l4.102)
&starts, &end, &startinpos, &endinpos, &exc, &s,[](#l4.103)
&writer))[](#l4.104)
goto onError;[](#l4.105)
goto error;[](#l4.106) }[](#l4.107) else {[](#l4.108) WRITECHAR('\\');[](#l4.109)
@@ -5602,8 +5571,17 @@ PyUnicode_DecodeUnicodeEscape(const char } break; }
nextByte:[](#l4.114)
;[](#l4.115)
continue;[](#l4.116)
error:[](#l4.118)
endinpos = s-starts;[](#l4.119)
if (unicode_decode_call_errorhandler_writer([](#l4.120)
errors, &errorHandler,[](#l4.121)
"unicodeescape", message,[](#l4.122)
&starts, &end, &startinpos, &endinpos, &exc, &s,[](#l4.123)
&writer))[](#l4.124)
goto onError;[](#l4.125)
}continue;[](#l4.126)