(original) (raw)

changeset: 81519:6ac4f1609847 branch: 3.3 parent: 81511:ad9b5c69b8b6 parent: 81518:13cd78a2a17b user: Serhiy Storchaka storchaka@gmail.com date: Tue Jan 15 15:01:20 2013 +0200 files: Lib/test/test_codecs.py Misc/NEWS Objects/unicodeobject.c description: Issue #14850: Now a chamap decoder treates U+FFFE as "undefined mapping" in any mapping, not only in an unicode string. diff -r ad9b5c69b8b6 -r 6ac4f1609847 Lib/test/test_codecs.py --- a/Lib/test/test_codecs.py Tue Jan 15 13:27:28 2013 +0200 +++ b/Lib/test/test_codecs.py Tue Jan 15 15:01:20 2013 +0200 @@ -1737,6 +1737,10 @@ codecs.charmap_decode, b"\x00\x01\x02", "strict", "ab" ) + self.assertRaises(UnicodeDecodeError, + codecs.charmap_decode, b"\x00\x01\x02", "strict", "ab\ufffe" + ) + self.assertEqual( codecs.charmap_decode(b"\x00\x01\x02", "replace", "ab"), ("ab\ufffd", 3) @@ -1793,6 +1797,17 @@ {0: 'a', 1: 'b'} ) + self.assertRaises(UnicodeDecodeError, + codecs.charmap_decode, b"\x00\x01\x02", "strict", + {0: 'a', 1: 'b', 2: None} + ) + + # Issue #14850 + self.assertRaises(UnicodeDecodeError, + codecs.charmap_decode, b"\x00\x01\x02", "strict", + {0: 'a', 1: 'b', 2: '\ufffe'} + ) + self.assertEqual( codecs.charmap_decode(b"\x00\x01\x02", "replace", {0: 'a', 1: 'b'}), @@ -1805,6 +1820,13 @@ ("ab\ufffd", 3) ) + # Issue #14850 + self.assertEqual( + codecs.charmap_decode(b"\x00\x01\x02", "replace", + {0: 'a', 1: 'b', 2: '\ufffe'}), + ("ab\ufffd", 3) + ) + self.assertEqual( codecs.charmap_decode(b"\x00\x01\x02", "ignore", {0: 'a', 1: 'b'}), @@ -1817,6 +1839,13 @@ ("ab", 3) ) + # Issue #14850 + self.assertEqual( + codecs.charmap_decode(b"\x00\x01\x02", "ignore", + {0: 'a', 1: 'b', 2: '\ufffe'}), + ("ab", 3) + ) + allbytes = bytes(range(256)) self.assertEqual( codecs.charmap_decode(allbytes, "ignore", {}), @@ -1857,6 +1886,11 @@ {0: a, 1: b}, ) + self.assertRaises(UnicodeDecodeError, + codecs.charmap_decode, b"\x00\x01\x02", "strict", + {0: a, 1: b, 2: 0xFFFE}, + ) + self.assertEqual( 
codecs.charmap_decode(b"\x00\x01\x02", "replace", {0: a, 1: b}), @@ -1864,11 +1898,23 @@ ) self.assertEqual( + codecs.charmap_decode(b"\x00\x01\x02", "replace", + {0: a, 1: b, 2: 0xFFFE}), + ("ab\ufffd", 3) + ) + + self.assertEqual( codecs.charmap_decode(b"\x00\x01\x02", "ignore", {0: a, 1: b}), ("ab", 3) ) + self.assertEqual( + codecs.charmap_decode(b"\x00\x01\x02", "ignore", + {0: a, 1: b, 2: 0xFFFE}), + ("ab", 3) + ) + class WithStmtTest(unittest.TestCase): def test_encodedfile(self): diff -r ad9b5c69b8b6 -r 6ac4f1609847 Misc/NEWS --- a/Misc/NEWS Tue Jan 15 13:27:28 2013 +0200 +++ b/Misc/NEWS Tue Jan 15 15:01:20 2013 +0200 @@ -12,6 +12,9 @@ Core and Builtins ----------------- +- Issue #14850: Now a chamap decoder treates U+FFFE as "undefined mapping" + in any mapping, not only in a string. + - Issue #16730: importlib.machinery.FileFinder now no longers raises an exception when trying to populate its cache and it finds out the directory is unreadable or has turned into a file. Reported and diagnosed by diff -r ad9b5c69b8b6 -r 6ac4f1609847 Objects/unicodeobject.c --- a/Objects/unicodeobject.c Tue Jan 15 13:27:28 2013 +0200 +++ b/Objects/unicodeobject.c Tue Jan 15 15:01:20 2013 +0200 @@ -7511,15 +7511,18 @@ if (PyErr_ExceptionMatches(PyExc_LookupError)) { /* No mapping found means: mapping is undefined. 
*/ PyErr_Clear(); - x = Py_None; - Py_INCREF(x); + goto Undefined; } else goto onError; } /* Apply mapping */ + if (x == Py_None) + goto Undefined; if (PyLong_Check(x)) { long value = PyLong_AS_LONG(x); + if (value == 0xFFFE) + goto Undefined; if (value < 0 || value > MAX_UNICODE) { PyErr_Format(PyExc_TypeError, "character mapping must be in range(0x%lx)", @@ -7530,21 +7533,6 @@ if (unicode_putchar(&v, &outpos, value) < 0) goto onError; } - else if (x == Py_None) { - /* undefined mapping */ - startinpos = s-starts; - endinpos = startinpos+1; - if (unicode_decode_call_errorhandler( - errors, &errorHandler, - "charmap", "character maps to <undefined>", - &starts, &e, &startinpos, &endinpos, &exc, &s, - &v, &outpos)) { - Py_DECREF(x); - goto onError; - } - Py_DECREF(x); - continue; - } else if (PyUnicode_Check(x)) { Py_ssize_t targetsize; @@ -7554,8 +7542,10 @@ if (targetsize == 1) { /* 1-1 mapping */ - if (unicode_putchar(&v, &outpos, - PyUnicode_READ_CHAR(x, 0)) < 0) + Py_UCS4 value = PyUnicode_READ_CHAR(x, 0); + if (value == 0xFFFE) + goto Undefined; + if (unicode_putchar(&v, &outpos, value) < 0) goto onError; } else if (targetsize > 1) { @@ -7590,6 +7580,19 @@ } Py_DECREF(x); ++s; + continue; +Undefined: + /* undefined mapping */ + Py_XDECREF(x); + startinpos = s-starts; + endinpos = startinpos+1; + if (unicode_decode_call_errorhandler( + errors, &errorHandler, + "charmap", "character maps to <undefined>", + &starts, &e, &startinpos, &endinpos, &exc, &s, + &v, &outpos)) { + goto onError; + } } } if (unicode_resize(&v, outpos) < 0)