cpython: 95386bbf9471 (original) (raw)
Mercurial > cpython
changeset 89365:95386bbf9471 3.3
Issue #19619: Blacklist non-text codecs in method API. str.encode, bytes.decode and bytearray.decode now use an internal API to throw LookupError for known non-text encodings, rather than attempting the encoding or decoding operation and then throwing a TypeError for an unexpected output type. The latter mechanism remains in place for third party non-text encodings. Backported changeset d68df99d7a57. [#19619]
Serhiy Storchaka storchaka@gmail.com | |
---|---|
date | Mon, 24 Feb 2014 14:43:03 +0200 |
parents | c89e495cdff8 |
children | 559ced4bb682 151a498c55e3 |
files | Include/codecs.h Lib/codecs.py Lib/encodings/base64_codec.py Lib/encodings/bz2_codec.py Lib/encodings/hex_codec.py Lib/encodings/quopri_codec.py Lib/encodings/rot_13.py Lib/encodings/uu_codec.py Lib/encodings/zlib_codec.py Lib/test/test_codecs.py Misc/NEWS Objects/unicodeobject.c Python/codecs.c |
diffstat | 13 files changed, 219 insertions(+), 19 deletions(-)[+] [-] Include/codecs.h 27 Lib/codecs.py 14 Lib/encodings/base64_codec.py 1 Lib/encodings/bz2_codec.py 1 Lib/encodings/hex_codec.py 1 Lib/encodings/quopri_codec.py 1 Lib/encodings/rot_13.py 1 Lib/encodings/uu_codec.py 1 Lib/encodings/zlib_codec.py 1 Lib/test/test_codecs.py 42 Misc/NEWS 6 Objects/unicodeobject.c 4 Python/codecs.c 138 |
line wrap: on
line diff
--- a/Include/codecs.h +++ b/Include/codecs.h @@ -94,6 +94,33 @@ PyAPI_FUNC(PyObject *) PyCodec_Decode( const char *errors ); +#ifndef PY_LIMITED_API +/* Text codec specific encoding and decoding API. +
- Checks the encoding against a list of codecs which do not
- implement a str<->bytes encoding before attempting the
- operation. +
- Please note that these APIs are internal and should not
- be used in Python C extensions. +
- */ + +PyAPI_FUNC(PyObject *) _PyCodec_EncodeText(
PyObject *object,[](#l1.20)
const char *encoding,[](#l1.21)
const char *errors[](#l1.22)
);[](#l1.23)
+ +PyAPI_FUNC(PyObject *) _PyCodec_DecodeText(
PyObject *object,[](#l1.26)
const char *encoding,[](#l1.27)
const char *errors[](#l1.28)
);[](#l1.29)
+#endif + + + /* --- Codec Lookup APIs -------------------------------------------------- All APIs return a codec object with incremented refcount and are
--- a/Lib/codecs.py +++ b/Lib/codecs.py @@ -73,9 +73,19 @@ BOM64_BE = BOM_UTF32_BE
Codec base classes (defining the API)
Private API to allow Python 3.4 to blacklist the known non-Unicode
codecs in the standard library. A more general mechanism to
reliably distinguish text encodings from other codecs will hopefully
be defined for Python 3.5
- #
See http://bugs.python.org/issue19619[](#l2.14)
- _is_text_encoding = True # Assume codecs are text encodings by default
def new(cls, encode, decode, streamreader=None, streamwriter=None,
incrementalencoder=None, incrementaldecoder=None, name=None):[](#l2.18)
incrementalencoder=None, incrementaldecoder=None, name=None,[](#l2.19)
*, _is_text_encoding=None):[](#l2.20) self = tuple.__new__(cls, (encode, decode, streamreader, streamwriter))[](#l2.21) self.name = name[](#l2.22) self.encode = encode[](#l2.23)
@@ -84,6 +94,8 @@ class CodecInfo(tuple): self.incrementaldecoder = incrementaldecoder self.streamwriter = streamwriter self.streamreader = streamreader
if _is_text_encoding is not None:[](#l2.28)
self._is_text_encoding = _is_text_encoding[](#l2.29) return self[](#l2.30)
--- a/Lib/encodings/base64_codec.py +++ b/Lib/encodings/base64_codec.py @@ -52,4 +52,5 @@ def getregentry(): incrementaldecoder=IncrementalDecoder, streamwriter=StreamWriter, streamreader=StreamReader,
--- a/Lib/encodings/bz2_codec.py +++ b/Lib/encodings/bz2_codec.py @@ -74,4 +74,5 @@ def getregentry(): incrementaldecoder=IncrementalDecoder, streamwriter=StreamWriter, streamreader=StreamReader,
--- a/Lib/encodings/hex_codec.py +++ b/Lib/encodings/hex_codec.py @@ -52,4 +52,5 @@ def getregentry(): incrementaldecoder=IncrementalDecoder, streamwriter=StreamWriter, streamreader=StreamReader,
--- a/Lib/encodings/quopri_codec.py +++ b/Lib/encodings/quopri_codec.py @@ -53,4 +53,5 @@ def getregentry(): incrementaldecoder=IncrementalDecoder, streamwriter=StreamWriter, streamreader=StreamReader,
--- a/Lib/encodings/rot_13.py +++ b/Lib/encodings/rot_13.py @@ -43,6 +43,7 @@ def getregentry(): incrementaldecoder=IncrementalDecoder, streamwriter=StreamWriter, streamreader=StreamReader,
Map
--- a/Lib/encodings/uu_codec.py +++ b/Lib/encodings/uu_codec.py @@ -96,4 +96,5 @@ def getregentry(): incrementaldecoder=IncrementalDecoder, streamreader=StreamReader, streamwriter=StreamWriter,
--- a/Lib/encodings/zlib_codec.py +++ b/Lib/encodings/zlib_codec.py @@ -74,4 +74,5 @@ def getregentry(): incrementaldecoder=IncrementalDecoder, streamreader=StreamReader, streamwriter=StreamWriter,
--- a/Lib/test/test_codecs.py +++ b/Lib/test/test_codecs.py @@ -4,6 +4,7 @@ import locale import sys import unittest import warnings +import encodings from test import support @@ -2408,6 +2409,47 @@ class TransformCodecTest(unittest.TestCa sout = reader.readline() self.assertEqual(sout, b"\x80")
- def test_text_to_binary_blacklists_binary_transforms(self):
# Check binary -> binary codecs give a good error for str input[](#l10.16)
bad_input = "bad input type"[](#l10.17)
for encoding in bytes_transform_encodings:[](#l10.18)
fmt = (r"{!r} is not a text encoding; "[](#l10.19)
r"use codecs.encode\(\) to handle arbitrary codecs")[](#l10.20)
msg = fmt.format(encoding)[](#l10.21)
with self.assertRaisesRegex(LookupError, msg) as failure:[](#l10.22)
bad_input.encode(encoding)[](#l10.23)
self.assertIsNone(failure.exception.__cause__)[](#l10.24)
- def test_text_to_binary_blacklists_text_transforms(self):
# Check str.encode gives a good error message for str -> str codecs[](#l10.27)
msg = (r"^'rot_13' is not a text encoding; "[](#l10.28)
r"use codecs.encode\(\) to handle arbitrary codecs")[](#l10.29)
with self.assertRaisesRegex(LookupError, msg):[](#l10.30)
"just an example message".encode("rot_13")[](#l10.31)
- def test_binary_to_text_blacklists_binary_transforms(self):
# Check bytes.decode and bytearray.decode give a good error[](#l10.34)
# message for binary -> binary codecs[](#l10.35)
data = b"encode first to ensure we meet any format restrictions"[](#l10.36)
for encoding in bytes_transform_encodings:[](#l10.37)
encoded_data = codecs.encode(data, encoding)[](#l10.38)
fmt = (r"{!r} is not a text encoding; "[](#l10.39)
r"use codecs.decode\(\) to handle arbitrary codecs")[](#l10.40)
msg = fmt.format(encoding)[](#l10.41)
with self.assertRaisesRegex(LookupError, msg):[](#l10.42)
encoded_data.decode(encoding)[](#l10.43)
with self.assertRaisesRegex(LookupError, msg):[](#l10.44)
bytearray(encoded_data).decode(encoding)[](#l10.45)
- def test_binary_to_text_blacklists_text_transforms(self):
# Check str -> str codec gives a good error for binary input[](#l10.48)
for bad_input in (b"immutable", bytearray(b"mutable")):[](#l10.49)
msg = (r"^'rot_13' is not a text encoding; "[](#l10.50)
r"use codecs.decode\(\) to handle arbitrary codecs")[](#l10.51)
with self.assertRaisesRegex(LookupError, msg) as failure:[](#l10.52)
bad_input.decode("rot_13")[](#l10.53)
self.assertIsNone(failure.exception.__cause__)[](#l10.54)
+ @unittest.skipUnless(sys.platform == 'win32', 'code pages are specific to Windows')
--- a/Misc/NEWS +++ b/Misc/NEWS @@ -10,6 +10,12 @@ What's New in Python 3.3.5 release candi Core and Builtins ----------------- +- Issue #19619: str.encode, bytes.decode and bytearray.decode now use an
- internal API to throw LookupError for known non-text encodings, rather
- than attempting the encoding or decoding operation and then throwing a
- TypeError for an unexpected output type. (The latter mechanism remains
- in place for third party non-text encodings) +
- Issue #20588: Make Python-ast.c C89 compliant.
- Issue #20437: Fixed 21 potential bugs when deleting objects references.
--- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -3129,7 +3129,7 @@ PyUnicode_Decode(const char *s, buffer = PyMemoryView_FromBuffer(&info); if (buffer == NULL) goto onError;
- unicode = _PyCodec_DecodeText(buffer, encoding, errors); if (unicode == NULL) goto onError; if (!PyUnicode_Check(unicode)) { @@ -3489,7 +3489,7 @@ PyUnicode_AsEncodedString(PyObject unic } /* Encode via the codec registry */
--- a/Python/codecs.c +++ b/Python/codecs.c @@ -337,18 +337,15 @@ PyObject *PyCodec_StreamWriter(const cha errors is passed to the encoder factory as argument if non-NULL. */ -PyObject *PyCodec_Encode(PyObject *object,
const char *encoding,[](#l13.8)
const char *errors)[](#l13.9)
+static PyObject * +_PyCodec_EncodeInternal(PyObject *object,
PyObject *encoder,[](#l13.12)
const char *encoding,[](#l13.13)
const char *errors)[](#l13.14)
- PyObject *encoder = NULL; PyObject *args = NULL, *result = NULL; PyObject *v = NULL;
- encoder = PyCodec_Encoder(encoding);
- if (encoder == NULL)
goto onError;[](#l13.22)
- args = args_tuple(object, errors); if (args == NULL) goto onError; @@ -384,18 +381,15 @@ PyObject *PyCodec_Encode(PyObject *objec errors is passed to the decoder factory as argument if non-NULL. */ -PyObject *PyCodec_Decode(PyObject *object,
const char *encoding,[](#l13.32)
const char *errors)[](#l13.33)
+static PyObject * +_PyCodec_DecodeInternal(PyObject *object,
PyObject *decoder,[](#l13.36)
const char *encoding,[](#l13.37)
const char *errors)[](#l13.38)
- PyObject *decoder = NULL; PyObject *args = NULL, *result = NULL; PyObject *v;
- decoder = PyCodec_Decoder(encoding);
- if (decoder == NULL)
goto onError;[](#l13.46)
- args = args_tuple(object, errors); if (args == NULL) goto onError; @@ -425,6 +419,118 @@ PyObject *PyCodec_Decode(PyObject objec return NULL; } +/* Generic encoding/decoding API */ +PyObject *PyCodec_Encode(PyObject *object,
const char *encoding,[](#l13.57)
const char *errors)[](#l13.58)
+} + +PyObject *PyCodec_Decode(PyObject *object,
const char *encoding,[](#l13.70)
const char *errors)[](#l13.71)
+} + +/* Text encoding/decoding API */ +static +PyObject *codec_getitem_checked(const char *encoding,
const char *operation_name,[](#l13.85)
int index)[](#l13.86)
- /* Backwards compatibility: assume any raw tuple describes a text
* encoding, and the same for anything lacking the private[](#l13.99)
* attribute.[](#l13.100)
*/[](#l13.101)
- if (!PyTuple_CheckExact(codec)) {
attr = _PyObject_GetAttrId(codec, &PyId__is_text_encoding);[](#l13.103)
if (attr == NULL) {[](#l13.104)
if (PyErr_ExceptionMatches(PyExc_AttributeError)) {[](#l13.105)
PyErr_Clear();[](#l13.106)
} else {[](#l13.107)
Py_DECREF(codec);[](#l13.108)
return NULL;[](#l13.109)
}[](#l13.110)
} else {[](#l13.111)
is_text_codec = PyObject_IsTrue(attr);[](#l13.112)
Py_DECREF(attr);[](#l13.113)
if (!is_text_codec) {[](#l13.114)
Py_DECREF(codec);[](#l13.115)
PyErr_Format(PyExc_LookupError,[](#l13.116)
"'%.400s' is not a text encoding; "[](#l13.117)
"use codecs.%s() to handle arbitrary codecs",[](#l13.118)
encoding, operation_name);[](#l13.119)
return NULL;[](#l13.120)
}[](#l13.121)
}[](#l13.122)
- }
+} + +static PyObject * _PyCodec_TextEncoder(const char *encoding) +{
+} + +static PyObject * _PyCodec_TextDecoder(const char *encoding) +{
+} + +PyObject *_PyCodec_EncodeText(PyObject *object,
const char *encoding,[](#l13.142)
const char *errors)[](#l13.143)
+} + +PyObject *_PyCodec_DecodeText(PyObject *object,
const char *encoding,[](#l13.155)
const char *errors)[](#l13.156)
+} + /* Register the error handling callback function error under the name name. This function will be called by the codec when it encounters an unencodable characters/undecodable bytes and doesn't know the