cpython: 037253b7cd6d (original) (raw)
--- a/Lib/test/test_codecs.py +++ b/Lib/test/test_codecs.py @@ -926,6 +926,32 @@ class CP65001Test(ReadTest, unittest.Tes class UTF7Test(ReadTest, unittest.TestCase): encoding = "utf-7"
- def test_ascii(self):
# Set D (directly encoded characters)[](#l1.8)
set_d = ('ABCDEFGHIJKLMNOPQRSTUVWXYZ'[](#l1.9)
'abcdefghijklmnopqrstuvwxyz'[](#l1.10)
'0123456789'[](#l1.11)
'\'(),-./:?')[](#l1.12)
self.assertEqual(set_d.encode(self.encoding), set_d.encode('ascii'))[](#l1.13)
self.assertEqual(set_d.encode('ascii').decode(self.encoding), set_d)[](#l1.14)
# Set O (optional direct characters)[](#l1.15)
set_o = ' !"#$%&*;<=>@[]^_`{|}'[](#l1.16)
self.assertEqual(set_o.encode(self.encoding), set_o.encode('ascii'))[](#l1.17)
self.assertEqual(set_o.encode('ascii').decode(self.encoding), set_o)[](#l1.18)
# +[](#l1.19)
self.assertEqual('a+b'.encode(self.encoding), b'a+-b')[](#l1.20)
self.assertEqual(b'a+-b'.decode(self.encoding), 'a+b')[](#l1.21)
# White spaces[](#l1.22)
ws = ' \t\n\r'[](#l1.23)
self.assertEqual(ws.encode(self.encoding), ws.encode('ascii'))[](#l1.24)
self.assertEqual(ws.encode('ascii').decode(self.encoding), ws)[](#l1.25)
# Other ASCII characters[](#l1.26)
other_ascii = ''.join(sorted(set(bytes(range(0x80)).decode()) -[](#l1.27)
set(set_d + set_o + '+' + ws)))[](#l1.28)
self.assertEqual(other_ascii.encode(self.encoding),[](#l1.29)
b'+AAAAAQACAAMABAAFAAYABwAIAAsADAAOAA8AEAARABIAEwAU'[](#l1.30)
b'ABUAFgAXABgAGQAaABsAHAAdAB4AHwBcAH4Afw-')[](#l1.31)
+ def test_partial(self): self.check_partial( 'a+-b\x00c\x80d\u0100e\U00010000f', @@ -967,7 +993,9 @@ class UTF7Test(ReadTest, unittest.TestCa def test_errors(self): tests = [
(b'\xffb', '\ufffdb'),[](#l1.40) (b'a\xffb', 'a\ufffdb'),[](#l1.41)
(b'a\xff\xffb', 'a\ufffd\ufffdb'),[](#l1.42) (b'a+IK', 'a\ufffd'),[](#l1.43) (b'a+IK-b', 'a\ufffdb'),[](#l1.44) (b'a+IK,b', 'a\ufffdb'),[](#l1.45)
@@ -983,6 +1011,8 @@ class UTF7Test(ReadTest, unittest.TestCa (b'a+//,+IKw-b', 'a\ufffd\u20acb'), (b'a+///,+IKw-b', 'a\uffff\ufffd\u20acb'), (b'a+////,+IKw-b', 'a\uffff\ufffd\u20acb'),
(b'a+IKw-b\xff', 'a\u20acb\ufffd'),[](#l1.50)
(b'a+IKw\xffb', 'a\u20ac\ufffdb'),[](#l1.51) ][](#l1.52) for raw, expected in tests:[](#l1.53) with self.subTest(raw=raw):[](#l1.54)
@@ -994,8 +1024,36 @@ class UTF7Test(ReadTest, unittest.TestCa self.assertEqual('\U000104A0'.encode(self.encoding), b'+2AHcoA-') self.assertEqual('\ud801\udca0'.encode(self.encoding), b'+2AHcoA-') self.assertEqual(b'+2AHcoA-'.decode(self.encoding), '\U000104A0') -
self.assertEqual(b'+2AHcoA'.decode(self.encoding), '\U000104A0')[](#l1.61)
self.assertEqual('\u20ac\U000104A0'.encode(self.encoding), b'+IKzYAdyg-')[](#l1.62)
self.assertEqual(b'+IKzYAdyg-'.decode(self.encoding), '\u20ac\U000104A0')[](#l1.63)
self.assertEqual(b'+IKzYAdyg'.decode(self.encoding), '\u20ac\U000104A0')[](#l1.64)
self.assertEqual('\u20ac\u20ac\U000104A0'.encode(self.encoding),[](#l1.65)
b'+IKwgrNgB3KA-')[](#l1.66)
self.assertEqual(b'+IKwgrNgB3KA-'.decode(self.encoding),[](#l1.67)
'\u20ac\u20ac\U000104A0')[](#l1.68)
self.assertEqual(b'+IKwgrNgB3KA'.decode(self.encoding),[](#l1.69)
'\u20ac\u20ac\U000104A0')[](#l1.70)
- def test_lone_surrogates(self):
tests = [[](#l1.73)
(b'a+2AE-b', 'a\ud801b'),[](#l1.74)
(b'a+2AE\xffb', 'a\ufffdb'),[](#l1.75)
(b'a+2AE', 'a\ufffd'),[](#l1.76)
(b'a+2AEA-b', 'a\ufffdb'),[](#l1.77)
(b'a+2AH-b', 'a\ufffdb'),[](#l1.78)
(b'a+IKzYAQ-b', 'a\u20ac\ud801b'),[](#l1.79)
(b'a+IKzYAQ\xffb', 'a\u20ac\ufffdb'),[](#l1.80)
(b'a+IKzYAQA-b', 'a\u20ac\ufffdb'),[](#l1.81)
(b'a+IKzYAd-b', 'a\u20ac\ufffdb'),[](#l1.82)
(b'a+IKwgrNgB-b', 'a\u20ac\u20ac\ud801b'),[](#l1.83)
(b'a+IKwgrNgB\xffb', 'a\u20ac\u20ac\ufffdb'),[](#l1.84)
(b'a+IKwgrNgB', 'a\u20ac\u20ac\ufffd'),[](#l1.85)
(b'a+IKwgrNgBA-b', 'a\u20ac\u20ac\ufffdb'),[](#l1.86)
][](#l1.87)
for raw, expected in tests:[](#l1.88)
with self.subTest(raw=raw):[](#l1.89)
self.assertEqual(raw.decode('utf-7', 'replace'), expected)[](#l1.90)
class UTF16ExTest(unittest.TestCase):
--- a/Lib/test/test_unicode.py +++ b/Lib/test/test_unicode.py @@ -1553,7 +1553,7 @@ class UnicodeTest(string_tests.CommonTes self.assertEqual(b'+2AHab9ze-'.decode('utf-7'), '\uD801\U000abcde') # Issue #2242: crash on some Windows/MSVC versions
self.assertEqual(b'+\xc1'.decode('utf-7'), '\xc1')[](#l2.7)
self.assertEqual(b'+\xc1'.decode('utf-7', 'ignore'), '')[](#l2.8)
# Direct encoded characters set_d = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789'(),-./:?" @@ -1995,6 +1995,7 @@ class UnicodeTest(string_tests.CommonTes self.assertRaises(UnicodeError, str, b'Andr\202 x', 'ascii', 'strict') self.assertEqual(str(b'Andr\202 x', 'ascii', 'ignore'), "Andr x") self.assertEqual(str(b'Andr\202 x', 'ascii', 'replace'), 'Andr\uFFFD x')
self.assertEqual(str(b'\202 x', 'ascii', 'replace'), '\uFFFD x')[](#l2.16)
# Error handling (unknown character names) self.assertEqual(b"\N{foo}xx".decode("unicode-escape", "ignore"), "xx")
--- a/Misc/NEWS +++ b/Misc/NEWS @@ -10,6 +10,8 @@ Release date: XXXX-XX-XX Core and Builtins ----------------- +- Issue #24848: Fixed a number of bugs in UTF-7 decoding of misformed data. +
- Issue #25267: The UTF-8 encoder is now up to 75 times as fast for error
  handlers: ``ignore``, ``replace``, ``surrogateescape``, ``surrogatepass``.
  Patch co-written with Serhiy Storchaka.
--- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -4360,31 +4360,31 @@ PyUnicode_DecodeUTF7Stateful(const char } else { /* now leaving a base-64 section */ inShift = 0;
s++;[](#l5.7)
if (surrogate) {[](#l5.8)
if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)[](#l5.9)
goto onError;[](#l5.10)
surrogate = 0;[](#l5.11)
}[](#l5.12) if (base64bits > 0) { /* left-over bits */[](#l5.13) if (base64bits >= 6) {[](#l5.14) /* We've seen at least one base-64 character */[](#l5.15)
s++;[](#l5.16) errmsg = "partial character in shift sequence";[](#l5.17) goto utf7Error;[](#l5.18) }[](#l5.19) else {[](#l5.20) /* Some bits remain; they should be zero */[](#l5.21) if (base64buffer != 0) {[](#l5.22)
s++;[](#l5.23) errmsg = "non-zero padding bits in shift sequence";[](#l5.24) goto utf7Error;[](#l5.25) }[](#l5.26) }[](#l5.27) }[](#l5.28)
if (ch != '-') {[](#l5.29)
if (surrogate && DECODE_DIRECT(ch)) {[](#l5.30)
if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)[](#l5.31)
goto onError;[](#l5.32)
}[](#l5.33)
surrogate = 0;[](#l5.34)
if (ch == '-') {[](#l5.35) /* '-' is absorbed; other terminating[](#l5.36) characters are preserved */[](#l5.37)
if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)[](#l5.38)
goto onError;[](#l5.39)
s++;[](#l5.40) }[](#l5.41) }[](#l5.42) }[](#l5.43)
@@ -4398,6 +4398,7 @@ PyUnicode_DecodeUTF7Stateful(const char } else { /* begin base64-encoded section */ inShift = 1;
surrogate = 0;[](#l5.48) shiftOutStart = writer.pos;[](#l5.49) base64bits = 0;[](#l5.50) base64buffer = 0;[](#l5.51)
@@ -4429,6 +4430,7 @@ utf7Error: if (inShift && !consumed) { /* in shift sequence, no more to follow */ /* if we're in an inconsistent state, that's an error */
inShift = 0;[](#l5.56) if (surrogate ||[](#l5.57) (base64bits >= 6) ||[](#l5.58) (base64bits > 0 && base64buffer != 0)) {[](#l5.59)
@@ -13366,6 +13368,7 @@ int if (maxchar > writer->maxchar || writer->readonly) { /* resize + widen */
maxchar = Py_MAX(maxchar, writer->maxchar);[](#l5.64) newbuffer = PyUnicode_New(newlen, maxchar);[](#l5.65) if (newbuffer == NULL)[](#l5.66) return -1;[](#l5.67)