cpython: 2cad20e2e588 (original) (raw)
--- a/Doc/library/codecs.rst
+++ b/Doc/library/codecs.rst
@@ -1011,6 +1011,11 @@ particular, the following variants typic
+-----------------+--------------------------------+--------------------------------+
| cp1258 | windows-1258 | Vietnamese |
+-----------------+--------------------------------+--------------------------------+
+| cp65001 | | Windows only: Windows UTF-8 |
+| | | (``CP_UTF8``) |
+| | | |
+| | | .. versionadded:: 3.3 |
++-----------------+--------------------------------+--------------------------------+
| euc_jp | eucjp, ujis, u-jis | Japanese |
+-----------------+--------------------------------+--------------------------------+
| euc_jis_2004 | jisx0213, eucjis2004 | Japanese |
--- a/Doc/whatsnew/3.3.rst
+++ b/Doc/whatsnew/3.3.rst
@@ -225,6 +225,11 @@ The :mod:`~encodings.mbcs` codec has be
:mod:`~encodings.mbcs` codec is now supporting all error handlers, instead of
only ``replace`` to encode and ``ignore`` to decode.
+A new Windows-only codec has been added: ``cp65001`` (:issue:`13247`). It is
+the Windows code page 65001 (Windows UTF-8, ``CP_UTF8``). For example, it is
+used by ``sys.stdout`` if the console output code page is set to cp65001 (e.g.
+using ``chcp 65001`` command).
+
Multibyte CJK decoders now resynchronize faster. They only ignore the first
byte of an invalid byte sequence. For example, ``b'\xff\n'.decode('gb2312',
'replace')`` now returns a ``\n`` after the replacement character.
new file mode 100644 --- /dev/null +++ b/Lib/encodings/cp65001.py @@ -0,0 +1,40 @@ +""" +Code page 65001: Windows UTF-8 (CP_UTF8). +""" + +import codecs +import functools + +if not hasattr(codecs, 'code_page_encode'):
+ +### Codec APIs + +encode = functools.partial(codecs.code_page_encode, 65001) +decode = functools.partial(codecs.code_page_decode, 65001) + +class IncrementalEncoder(codecs.IncrementalEncoder):
+ +class IncrementalDecoder(codecs.BufferedIncrementalDecoder):
+ +class StreamWriter(codecs.StreamWriter):
+ +class StreamReader(codecs.StreamReader):
+ +### encodings module API + +def getregentry():
- return codecs.CodecInfo(
name='cp65001',[](#l3.37)
encode=encode,[](#l3.38)
decode=decode,[](#l3.39)
incrementalencoder=IncrementalEncoder,[](#l3.40)
incrementaldecoder=IncrementalDecoder,[](#l3.41)
streamreader=StreamReader,[](#l3.42)
streamwriter=StreamWriter,[](#l3.43)
- )
--- a/Lib/test/test_codecs.py +++ b/Lib/test/test_codecs.py @@ -4,6 +4,11 @@ import codecs import locale import sys, _testcapi, io +if sys.platform == 'win32':
+ try: import ctypes except ImportError: @@ -636,6 +641,107 @@ class UTF8Test(ReadTest): "\U00010fff\uD800") self.assertTrue(codecs.lookup_error("surrogatepass")) +@unittest.skipUnless(sys.platform == 'win32',
'cp65001 is a Windows-only codec')[](#l4.20)
def test_encode(self):
    # Each case is (text, error handler, expected bytes).  An expected value
    # of None means the encode call must raise UnicodeEncodeError.
    cases = [
        ('abc', 'strict', b'abc'),
        ('\xe9\u20ac', 'strict', b'\xc3\xa9\xe2\x82\xac'),
        ('\U0010ffff', 'strict', b'\xf4\x8f\xbf\xbf'),
    ]
    # The expectations for the lone surrogate U+DC80 depend on the
    # VISTA_OR_LATER flag.
    if not VISTA_OR_LATER:
        cases.append(('\udc80', 'strict', b'\xed\xb2\x80'))
    else:
        cases += [
            ('\udc80', 'strict', None),
            ('\udc80', 'ignore', b''),
            ('\udc80', 'replace', b'?'),
            ('\udc80', 'backslashreplace', b'\\udc80'),
            ('\udc80', 'surrogatepass', b'\xed\xb2\x80'),
        ]

    for text, handler, wanted in cases:
        if wanted is None:
            self.assertRaises(UnicodeEncodeError,
                              text.encode, "cp65001", handler)
            continue
        try:
            got = text.encode('cp65001', handler)
        except UnicodeEncodeError as exc:
            # Report an unexpected encoding error as a test failure that
            # names the offending input and handler.
            self.fail('Unable to encode %a to cp65001 with '
                      'errors=%r: %s' % (text, handler, exc))
        message = ('%a.encode("cp65001", %r)=%a != %a'
                   % (text, handler, got, wanted))
        self.assertEqual(got, wanted, message)
def test_decode(self):
    # Each case is (raw bytes, error handler, expected text).  An expected
    # value of None means the decode call must raise UnicodeDecodeError.
    cases = [
        (b'abc', 'strict', 'abc'),
        (b'\xc3\xa9\xe2\x82\xac', 'strict', '\xe9\u20ac'),
        (b'\xf4\x8f\xbf\xbf', 'strict', '\U0010ffff'),
        (b'\xef\xbf\xbd', 'strict', '\ufffd'),
        (b'[\xc3\xa9]', 'strict', '[\xe9]'),
        # invalid bytes
        (b'[\xff]', 'strict', None),
        (b'[\xff]', 'ignore', '[]'),
        (b'[\xff]', 'replace', '[\ufffd]'),
        (b'[\xff]', 'surrogateescape', '[\udcff]'),
    ]
    # The expectations for the byte sequence ED B2 80 depend on the
    # VISTA_OR_LATER flag.
    if not VISTA_OR_LATER:
        cases.append((b'[\xed\xb2\x80]', 'strict', '[\udc80]'))
    else:
        cases += [
            (b'[\xed\xb2\x80]', 'strict', None),
            (b'[\xed\xb2\x80]', 'ignore', '[]'),
            (b'[\xed\xb2\x80]', 'replace', '[\ufffd\ufffd\ufffd]'),
        ]

    for raw, handler, wanted in cases:
        if wanted is None:
            self.assertRaises(UnicodeDecodeError,
                              raw.decode, 'cp65001', handler)
            continue
        try:
            got = raw.decode('cp65001', handler)
        except UnicodeDecodeError as exc:
            # Report an unexpected decoding error as a test failure that
            # names the offending input and handler.
            self.fail('Unable to decode %a from cp65001 with '
                      'errors=%r: %s' % (raw, handler, exc))
        message = ('%a.decode("cp65001", %r)=%a != %a'
                   % (raw, handler, got, wanted))
        self.assertEqual(got, wanted, message)
@unittest.skipUnless(VISTA_OR_LATER, 'require Windows Vista or later')
def test_lone_surrogates(self):
    # With the default error handler, a lone surrogate must be rejected both
    # when encoding and when decoding its three-byte form.
    self.assertRaises(UnicodeEncodeError, "\ud800".encode, "cp65001")
    self.assertRaises(UnicodeDecodeError, b"\xed\xa0\x80".decode, "cp65001")

    # Encoding U+DC80 with each of the remaining error handlers.
    self.assertEqual("[\uDC80]".encode("cp65001", "backslashreplace"),
                     b'[\\udc80]')
    # xmlcharrefreplace emits a decimal character reference, and
    # 0xDC80 == 56448.  The expected value must be spelled in ASCII because
    # a bytes literal cannot contain non-ASCII characters.
    self.assertEqual("[\uDC80]".encode("cp65001", "xmlcharrefreplace"),
                     b'[&#56448;]')
    self.assertEqual("[\uDC80]".encode("cp65001", "surrogateescape"),
                     b'[\x80]')
    self.assertEqual("[\uDC80]".encode("cp65001", "ignore"),
                     b'[]')
    self.assertEqual("[\uDC80]".encode("cp65001", "replace"),
                     b'[?]')
@unittest.skipUnless(VISTA_OR_LATER, 'require Windows Vista or later')
def test_surrogatepass_handler(self):
    # (text, bytes) pairs that must convert into each other through cp65001
    # when the "surrogatepass" error handler is used.  Each pair is checked
    # in the encode direction first, then in the decode direction.
    round_trips = (
        ("abc\ud800def", b"abc\xed\xa0\x80def"),
        ("\U00010fff\uD800", b"\xf0\x90\xbf\xbf\xed\xa0\x80"),
    )
    for text, raw in round_trips:
        self.assertEqual(text.encode("cp65001", "surrogatepass"), raw)
        self.assertEqual(raw.decode("cp65001", "surrogatepass"), text)

    # The handler must also be retrievable by name.
    handler = codecs.lookup_error("surrogatepass")
    self.assertTrue(handler)
+ + + class UTF7Test(ReadTest): encoding = "utf-7" @@ -1747,11 +1853,9 @@ class TransformCodecTest(unittest.TestCa @unittest.skipUnless(sys.platform == 'win32', 'code pages are specific to Windows') class CodePageTest(unittest.TestCase):
- def test_invalid_code_page(self): self.assertRaises(ValueError, codecs.code_page_encode, -1, 'a') self.assertRaises(ValueError, codecs.code_page_decode, -1, b'a') @@ -1804,19 +1908,22 @@ class CodePageTest(unittest.TestCase): self.check_encode(932, ( ('abc', 'strict', b'abc'), ('\uff44\u9a3e', 'strict', b'\x82\x84\xe9\x80'),
# not encodable[](#l4.140)
# test error handlers[](#l4.141) ('\xff', 'strict', None),[](#l4.142) ('[\xff]', 'ignore', b'[]'),[](#l4.143) ('[\xff]', 'replace', b'[y]'),[](#l4.144) ('[\u20ac]', 'replace', b'[?]'),[](#l4.145)
('[\xff]', 'backslashreplace', b'[\\xff]'),[](#l4.146)
('[\xff]', 'xmlcharrefreplace', b'[ÿ]'),[](#l4.147) ))[](#l4.148) self.check_decode(932, ([](#l4.149) (b'abc', 'strict', 'abc'),[](#l4.150) (b'\x82\x84\xe9\x80', 'strict', '\uff44\u9a3e'),[](#l4.151) # invalid bytes[](#l4.152)
(b'\xff', 'strict', None),[](#l4.153)
(b'\xff', 'ignore', ''),[](#l4.154)
(b'\xff', 'replace', '\ufffd'),[](#l4.155)
(b'[\xff]', 'strict', None),[](#l4.156)
(b'[\xff]', 'ignore', '[]'),[](#l4.157)
(b'[\xff]', 'replace', '[\ufffd]'),[](#l4.158)
(b'[\xff]', 'surrogateescape', '[\udcff]'),[](#l4.159) (b'\x81\x00abc', 'strict', None),[](#l4.160) (b'\x81\x00abc', 'ignore', '\x00abc'),[](#l4.161) (b'\x81\x00abc', 'replace', '\ufffd\x00abc'),[](#l4.162)
@@ -1857,58 +1964,6 @@ class CodePageTest(unittest.TestCase): (b'[\xff]', 'strict', '[\xff]'), ))
tests = [[](#l4.170)
('abc', 'strict', b'abc'),[](#l4.171)
('\xe9\u20ac', 'strict', b'\xc3\xa9\xe2\x82\xac'),[](#l4.172)
('\U0010ffff', 'strict', b'\xf4\x8f\xbf\xbf'),[](#l4.173)
][](#l4.174)
if self.vista_or_later():[](#l4.175)
tests.append(('\udc80', 'strict', None))[](#l4.176)
tests.append(('\udc80', 'ignore', b''))[](#l4.177)
tests.append(('\udc80', 'replace', b'?'))[](#l4.178)
else:[](#l4.179)
tests.append(('\udc80', 'strict', b'\xed\xb2\x80'))[](#l4.180)
self.check_encode(cp, tests)[](#l4.181)
tests = [[](#l4.183)
(b'abc', 'strict', 'abc'),[](#l4.184)
(b'\xc3\xa9\xe2\x82\xac', 'strict', '\xe9\u20ac'),[](#l4.185)
(b'\xf4\x8f\xbf\xbf', 'strict', '\U0010ffff'),[](#l4.186)
(b'\xef\xbf\xbd', 'strict', '\ufffd'),[](#l4.187)
(b'[\xc3\xa9]', 'strict', '[\xe9]'),[](#l4.188)
# invalid bytes[](#l4.189)
(b'[\xff]', 'strict', None),[](#l4.190)
(b'[\xff]', 'ignore', '[]'),[](#l4.191)
(b'[\xff]', 'replace', '[\ufffd]'),[](#l4.192)
][](#l4.193)
if self.vista_or_later():[](#l4.194)
tests.extend(([](#l4.195)
(b'[\xed\xb2\x80]', 'strict', None),[](#l4.196)
(b'[\xed\xb2\x80]', 'ignore', '[]'),[](#l4.197)
(b'[\xed\xb2\x80]', 'replace', '[\ufffd\ufffd\ufffd]'),[](#l4.198)
))[](#l4.199)
else:[](#l4.200)
tests.extend(([](#l4.201)
(b'[\xed\xb2\x80]', 'strict', '[\udc80]'),[](#l4.202)
))[](#l4.203)
self.check_decode(cp, tests)[](#l4.204)
def test_error_handlers(self):
    # Cases are (input, error handler, expected output) tuples handed to the
    # check_encode / check_decode helpers together with a code page number.
    self.check_encode(932, (
        ('\xff', 'backslashreplace', b'\\xff'),
        # xmlcharrefreplace emits a decimal character reference
        # (0xFF == 255).  The expected value must be spelled in ASCII
        # because a bytes literal cannot contain non-ASCII characters.
        ('\xff', 'xmlcharrefreplace', b'&#255;'),
    ))
    self.check_decode(932, (
        (b'\xff', 'surrogateescape', '\udcff'),
    ))
    # The surrogatepass case is only exercised when vista_or_later() is true.
    if self.vista_or_later():
        self.check_encode(self.CP_UTF8, (
            ('\udc80', 'surrogatepass', b'\xed\xb2\x80'),
        ))
- def test_multibyte_encoding(self): self.check_decode(932, ( (b'\x84\xe9\x80', 'ignore', '\u9a3e'), @@ -1918,7 +1973,7 @@ class CodePageTest(unittest.TestCase): (b'\xff\xf4\x8f\xbf\xbf', 'ignore', '\U0010ffff'), (b'\xff\xf4\x8f\xbf\xbf', 'replace', '\ufffd\U0010ffff'), ))
if self.vista_or_later():[](#l4.226)
if VISTA_OR_LATER:[](#l4.227) self.check_encode(self.CP_UTF8, ([](#l4.228) ('[\U0010ffff\uDC80]', 'ignore', b'[\xf4\x8f\xbf\xbf]'),[](#l4.229) ('[\U0010ffff\uDC80]', 'replace', b'[\xf4\x8f\xbf\xbf?]'),[](#l4.230)
@@ -1951,6 +2006,7 @@ def test_main(): UTF16BETest, UTF8Test, UTF8SigTest,
CP65001Test,[](#l4.235) UTF7Test,[](#l4.236) UTF16ExTest,[](#l4.237) ReadBufferTest,[](#l4.238)
--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -341,6 +341,8 @@ Core and Builtins
Library
-------

+- Issue #13247: Add cp65001 codec, the Windows UTF-8 (CP_UTF8).
+