[issue3672] Ill-formed surrogates not treated as errors during encoding/decoding - Code Review (original) (raw)
OLD
NEW
1 /* ------------------------------------------------------------------------
1 /* ------------------------------------------------------------------------
2
2
3 Python Codec Registry and support functions
3 Python Codec Registry and support functions
4
4
5 Written by Marc-Andre Lemburg (mal@lemburg.com).
5 Written by Marc-Andre Lemburg (mal@lemburg.com).
6
6
7 Copyright (c) Corporation for National Research Initiatives.
7 Copyright (c) Corporation for National Research Initiatives.
8
8
9 ------------------------------------------------------------------------ */
9 ------------------------------------------------------------------------ */
10
10
(...skipping 730 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading...
741 Py_DECREF(res);
741 Py_DECREF(res);
742 Py_DECREF(object);
742 Py_DECREF(object);
743 return restuple;
743 return restuple;
744 }
744 }
745 else {
745 else {
746 wrong_exception_type(exc);
746 wrong_exception_type(exc);
747 return NULL;
747 return NULL;
748 }
748 }
749 }
749 }
750
750
751 PyObject *PyCodec_SurrogateErrors(PyObject *exc)
752 {
753 PyObject *restuple;
754 PyObject *object;
755 Py_ssize_t start;
756 Py_ssize_t end;
757 PyObject *res;
758 if (PyObject_IsInstance(exc, PyExc_UnicodeEncodeError)) {
759 Py_UNICODE *p;
760 Py_UNICODE *startp;
761 char *outp;
762 if (PyUnicodeEncodeError_GetStart(exc, &start))
763 return NULL;
764 if (PyUnicodeEncodeError_GetEnd(exc, &end))
765 return NULL;
766 if (!(object = PyUnicodeEncodeError_GetObject(exc)))
767 return NULL;
768 startp = PyUnicode_AS_UNICODE(object);
769 res = PyBytes_FromStringAndSize(NULL, 3*(end-start));
770 if (!res)
771 return NULL;
772 outp = PyBytes_AsString(res);
773 for (p = startp+start; p < startp+end; p++) {
774 Py_UNICODE ch = *p;
775 if (ch < 0xd800 || ch > 0xdfff) {
776 /* Not a surrogate, fail with original exception */
777 PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
778 Py_DECREF(res);
779 Py_DECREF(object);
780 return NULL;
781 }
782 *outp++ = (char)(0xe0 | (ch >> 12));
783 *outp++ = (char)(0x80 | ((ch >> 6) & 0x3f));
784 *outp++ = (char)(0x80 | (ch & 0x3f));
785 }
786 restuple = Py_BuildValue("(On)", res, end);
787 Py_DECREF(res);
788 Py_DECREF(object);
789 return restuple;
790 }
791 else if (PyObject_IsInstance(exc, PyExc_UnicodeDecodeError)) {
792 unsigned char *p;
793 Py_UNICODE ch = 0;
794 if (PyUnicodeDecodeError_GetStart(exc, &start))
795 return NULL;
796 if (!(object = PyUnicodeDecodeError_GetObject(exc)))
797 return NULL;
798 if (!(p = (unsigned char*)PyBytes_AsString(object))) {
799 Py_DECREF(object);
800 return NULL;
801 }
802 /* Try decoding a single surrogate character. If
803 there are more, let the codec call us again. */
804 if ((p[0] & 0xf0) == 0xe0 ||·
805 (p[1] & 0xc0) == 0x80 ||
806 (p[2] & 0xc0) == 0x80) {
807 /* it's a three-byte code */
808 ch = ((p[0] & 0x0f) << 12) + ((p[1] & 0x3f) << 6) + (p[2] & 0x3f);
809 if (ch < 0xd800 || ch > 0xdfff)
810 /* it's not a surrogate - fail */
811 ch = 0;
812 }
813 Py_DECREF(object);
814 if (ch == 0) {
815 PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
816 return NULL;
817 }
818 return Py_BuildValue("(u#n)", &ch, 1, start+3);
819 }
820 else {
821 wrong_exception_type(exc);
822 return NULL;
823 }
824 }
825
826 ········
751 static PyObject *strict_errors(PyObject *self, PyObject *exc)
827 static PyObject *strict_errors(PyObject *self, PyObject *exc)
752 {
828 {
753 return PyCodec_StrictErrors(exc);
829 return PyCodec_StrictErrors(exc);
754 }
830 }
755
831
756
832
757 static PyObject *ignore_errors(PyObject *self, PyObject *exc)
833 static PyObject *ignore_errors(PyObject *self, PyObject *exc)
758 {
834 {
759 return PyCodec_IgnoreErrors(exc);
835 return PyCodec_IgnoreErrors(exc);
760 }
836 }
761
837
762
838
763 static PyObject *replace_errors(PyObject *self, PyObject *exc)
839 static PyObject *replace_errors(PyObject *self, PyObject *exc)
764 {
840 {
765 return PyCodec_ReplaceErrors(exc);
841 return PyCodec_ReplaceErrors(exc);
766 }
842 }
767
843
768
844
769 static PyObject *xmlcharrefreplace_errors(PyObject *self, PyObject *exc)
845 static PyObject *xmlcharrefreplace_errors(PyObject *self, PyObject *exc)
770 {
846 {
771 return PyCodec_XMLCharRefReplaceErrors(exc);
847 return PyCodec_XMLCharRefReplaceErrors(exc);
772 }
848 }
773
849
774
850
775 static PyObject *backslashreplace_errors(PyObject *self, PyObject *exc)
851 static PyObject *backslashreplace_errors(PyObject *self, PyObject *exc)
776 {
852 {
777 return PyCodec_BackslashReplaceErrors(exc);
853 return PyCodec_BackslashReplaceErrors(exc);
778 }
854 }
779
855
856 static PyObject *surrogates_errors(PyObject *self, PyObject *exc)
857 {
858 return PyCodec_SurrogateErrors(exc);
859 }
860
780 static int _PyCodecRegistry_Init(void)
861 static int _PyCodecRegistry_Init(void)
781 {
862 {
782 static struct {
863 static struct {
783 char *name;
864 char *name;
784 PyMethodDef def;
865 PyMethodDef def;
785 } methods[] =
866 } methods[] =
786 {
867 {
787 {
868 {
788 "strict",
869 "strict",
789 {
870 {
(...skipping 26 matching lines...) Expand all Loading...
816 METH_O
897 METH_O
817 }
898 }
818 },
899 },
819 {
900 {
820 "backslashreplace",
901 "backslashreplace",
821 {
902 {
822 "backslashreplace_errors",
903 "backslashreplace_errors",
823 backslashreplace_errors,
904 backslashreplace_errors,
824 METH_O
905 METH_O
825 }
906 }
907 },
908 {
909 "surrogates",
910 {
911 "surrogates",
912 surrogates_errors,
913 METH_O
914 }
826 }
915 }
827 };
916 };
828
917
829 PyInterpreterState *interp = PyThreadState_GET()->interp;
918 PyInterpreterState *interp = PyThreadState_GET()->interp;
830 PyObject *mod;
919 PyObject *mod;
831 unsigned i;
920 unsigned i;
832
921
833 if (interp->codec_search_path != NULL)
922 if (interp->codec_search_path != NULL)
834 return 0;
923 return 0;
835
924
(...skipping 29 matching lines...) Expand all Loading...
865 configuration are still reported back to the user. */
954 configuration are still reported back to the user. */
866 PyErr_Clear();
955 PyErr_Clear();
867 return 0;
956 return 0;
868 }
957 }
869 return -1;
958 return -1;
870 }
959 }
871 Py_DECREF(mod);
960 Py_DECREF(mod);
872 interp->codecs_initialized = 1;
961 interp->codecs_initialized = 1;
873 return 0;
962 return 0;
874 }
963 }
OLD
NEW