cpython: 2cad20e2e588 (original) (raw)

--- a/Doc/library/codecs.rst +++ b/Doc/library/codecs.rst @@ -1011,6 +1011,11 @@ particular, the following variants typic +-----------------+--------------------------------+--------------------------------+ | cp1258 | windows-1258 | Vietnamese | +-----------------+--------------------------------+--------------------------------+ +| cp65001 | | Windows only: Windows UTF-8 | +| | | (CP_UTF8) | +| | | | +| | | .. versionadded:: 3.3 | ++-----------------+--------------------------------+--------------------------------+ | euc_jp | eucjp, ujis, u-jis | Japanese | +-----------------+--------------------------------+--------------------------------+ | euc_jis_2004 | jisx0213, eucjis2004 | Japanese |

--- a/Doc/whatsnew/3.3.rst +++ b/Doc/whatsnew/3.3.rst @@ -225,6 +225,11 @@ The :mod:`~encodings.mbcs` codec has be :mod:`~encodings.mbcs` codec is now supporting all error handlers, instead of only ``replace`` to encode and ``ignore`` to decode. +A new Windows-only codec has been added: ``cp65001`` (:issue:`13247`). It is +the Windows code page 65001 (Windows UTF-8, ``CP_UTF8``). For example, it is +used by ``sys.stdout`` if the console output code page is set to cp65001 (e.g. +using ``chcp 65001`` command). + Multibyte CJK decoders now resynchronize faster. They only ignore the first byte of an invalid byte sequence. For example, ``b'\xff\n'.decode('gb2312', 'replace')`` now returns a ``\n`` after the replacement character.

new file mode 100644 --- /dev/null +++ b/Lib/encodings/cp65001.py @@ -0,0 +1,40 @@ +""" +Code page 65001: Windows UTF-8 (CP_UTF8). +""" + +import codecs +import functools + +if not hasattr(codecs, 'code_page_encode'):

+ +### Codec APIs + +encode = functools.partial(codecs.code_page_encode, 65001) +decode = functools.partial(codecs.code_page_decode, 65001) + +class IncrementalEncoder(codecs.IncrementalEncoder):

+ +class IncrementalDecoder(codecs.BufferedIncrementalDecoder):

+ +class StreamWriter(codecs.StreamWriter):

+ +class StreamReader(codecs.StreamReader):

+ +### encodings module API + +def getregentry():

--- a/Lib/test/test_codecs.py +++ b/Lib/test/test_codecs.py @@ -4,6 +4,11 @@ import codecs import locale import sys, _testcapi, io +if sys.platform == 'win32':

+else:

+ try: import ctypes except ImportError: @@ -636,6 +641,107 @@ class UTF8Test(ReadTest): "\U00010fff\uD800") self.assertTrue(codecs.lookup_error("surrogatepass")) +@unittest.skipUnless(sys.platform == 'win32',

+class CP65001Test(ReadTest):

+

+

+

+

+ + + class UTF7Test(ReadTest): encoding = "utf-7" @@ -1747,11 +1853,9 @@ class TransformCodecTest(unittest.TestCa @unittest.skipUnless(sys.platform == 'win32', 'code pages are specific to Windows') class CodePageTest(unittest.TestCase):

- def test_invalid_code_page(self): self.assertRaises(ValueError, codecs.code_page_encode, -1, 'a') self.assertRaises(ValueError, codecs.code_page_decode, -1, b'a') @@ -1804,19 +1908,22 @@ class CodePageTest(unittest.TestCase): self.check_encode(932, ( ('abc', 'strict', b'abc'), ('\uff44\u9a3e', 'strict', b'\x82\x84\xe9\x80'),

@@ -1857,58 +1964,6 @@ class CodePageTest(unittest.TestCase): (b'[\xff]', 'strict', '[\xff]'), ))

-

-

-

- def test_multibyte_encoding(self): self.check_decode(932, ( (b'\x84\xe9\x80', 'ignore', '\u9a3e'), @@ -1918,7 +1973,7 @@ class CodePageTest(unittest.TestCase): (b'\xff\xf4\x8f\xbf\xbf', 'ignore', '\U0010ffff'), (b'\xff\xf4\x8f\xbf\xbf', 'replace', '\ufffd\U0010ffff'), ))

@@ -1951,6 +2006,7 @@ def test_main(): UTF16BETest, UTF8Test, UTF8SigTest,

--- a/Misc/NEWS +++ b/Misc/NEWS @@ -341,6 +341,8 @@ Core and Builtins Library ------- +- Issue #13247: Add cp65001 codec, the Windows UTF-8 (CP_UTF8). +