(original) (raw)

Index: Objects/unicodeobject.c
===================================================================
RCS file: /cvsroot/python/python/dist/src/Objects/unicodeobject.c,v
retrieving revision 2.231
diff -u -r2.231 unicodeobject.c
--- Objects/unicodeobject.c	30 Aug 2005 10:23:14 -0000	2.231
+++ Objects/unicodeobject.c	5 Oct 2005 14:39:34 -0000
@@ -3606,6 +3606,175 @@
     return NULL;
 }
 
+/* --- Fast Mapping Decoder ----------------------------------------------- */
+
+/* Decode size bytes starting at s with a 256-entry lookup table that is
+   indexed by byte value.
+
+   A table entry equal to Py_UNICODE_REPLACEMENT_CHARACTER marks the byte as
+   undefined; such bytes are handed to the error handler named by errors.
+   A fastmap table therefore cannot map a byte to U+FFFD itself.
+
+   Returns a new unicode object, or NULL on error. */
+
+PyObject *PyUnicode_DecodeFastmap(const char *s,
+                                  int size,
+                                  const Py_UNICODE *table,
+                                  const char *errors)
+{
+    const char *starts = s;
+    int startinpos;
+    int endinpos;
+    int outpos;
+    const char *e;
+    PyUnicodeObject *v;
+    Py_UNICODE *p;
+    PyObject *errorHandler = NULL;
+    PyObject *exc = NULL;
+
+    v = _PyUnicode_New(size);
+    if (v == NULL)
+        goto onError;
+    if (size == 0)
+        return (PyObject *)v;
+    p = PyUnicode_AS_UNICODE(v);
+    e = s + size;
+    while (s < e) {
+        unsigned char ch = *s;
+        Py_UNICODE value;
+
+        value = table[(int)ch];
+
+        /* Apply mapping */
+        if (value != Py_UNICODE_REPLACEMENT_CHARACTER)
+            *p++ = (Py_UNICODE)value;
+        else {
+            /* undefined mapping */
+            outpos = p-PyUnicode_AS_UNICODE(v);
+            startinpos = s-starts;
+            endinpos = startinpos+1;
+            if (unicode_decode_call_errorhandler(
+                 errors, &errorHandler,
+                 "fastmap", "fastmap to <undefined>",
+                 starts, size, &startinpos, &endinpos, &exc, &s,
+                 (PyObject **)&v, &outpos, &p)) {
+                goto onError;
+            }
+            continue;
+        }
+        ++s;
+    }
+    if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
+        if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))) < 0)
+            goto onError;
+    Py_XDECREF(errorHandler);
+    Py_XDECREF(exc);
+    return (PyObject *)v;
+
+    onError:
+    Py_XDECREF(errorHandler);
+    Py_XDECREF(exc);
+    Py_XDECREF(v);
+    return NULL;
+}
+
+/* Encode size Py_UNICODE characters starting at p into a byte string.
+
+   index holds indexsize segment descriptors, one per high byte (ch >> 8)
+   that has any mapping; each descriptor selects a slice of table that is
+   indexed by the low byte.  index[0] is reserved for high byte 0 so that
+   the latin-1 range needs no search.  A segment with lowfirst == 0xff and
+   lowlast == 0x00 is a full 256-entry mapping without an undefined mark.
+
+   XXX errors is currently ignored: any unencodable character raises
+   NotImplementedError.
+
+   Returns a new string object, or NULL on error. */
+
+PyObject *PyUnicode_EncodeFastmap(const Py_UNICODE *p,
+                                  int size,
+                                  const unsigned char *table,
+                                  const PyFastmap_EncodingIndex *index,
+                                  int indexsize,
+                                  const char *errors)
+{
+    const Py_UNICODE *e;
+    char *op;
+    PyObject *v;
+    PyObject *errorHandler = NULL;
+    PyObject *exc = NULL;
+
+    v = PyString_FromStringAndSize(NULL, size);
+    if (v == NULL)
+        goto onError;
+    if (size == 0)
+        return (PyObject *)v;
+    op = PyString_AS_STRING(v);
+    e = p + size;
+    while (p < e) {
+        Py_UNICODE ch = *p;
+        unsigned char high, low, coded = 0;
+        const PyFastmap_EncodingIndex *segment = NULL;
+
+        high = (unsigned char)(ch >> 8);
+        low = (unsigned char)(ch & 0xff);
+
+        if ((unsigned long)ch > 0xffffUL)
+            /* Outside the BMP (UCS4 builds): the index is keyed on a
+               single high byte, so the truncated value of "high" would
+               alias a BMP segment.  Treat as unencodable. */;
+        else if (high == 0) {
+            /* fast path for latin-1 area */
+            if (indexsize > 0 && index[0].high == 0)
+                segment = index;
+        }
+        else {
+            /* XXX: this may be improved using binary search */
+            int i;
+            for (i = 1; i < indexsize; i++)
+                if (index[i].high == high) {
+                    segment = &index[i];
+                    break;
+                }
+        }
+
+        if (segment == NULL)
+            /* segment not found */;
+        else if (segment->lowfirst == 0xff && segment->lowlast == 0x00)
+            /* special case for the full mapping to one segment */
+            coded = table[segment->mapindex + low];
+        else if (low < segment->lowfirst || segment->lowlast < low)
+            segment = NULL; /* out of the segment */
+        else {
+            coded = table[segment->mapindex + low - segment->lowfirst];
+            if (coded == segment->undefmark)
+                segment = NULL; /* marked as undefined */
+        }
+
+        /* Apply mapping */
+        if (segment != NULL)
+            *op++ = (char)coded;
+        else {
+            PyErr_SetString(PyExc_NotImplementedError,
+                            "error handling is not implemented yet. :-)");
+            goto onError;
+        }
+        ++p;
+    }
+    if (op - PyString_AS_STRING(v) < PyString_GET_SIZE(v))
+        if (_PyString_Resize(&v, (int)(op - PyString_AS_STRING(v))) < 0)
+            goto onError;
+    Py_XDECREF(errorHandler);
+    Py_XDECREF(exc);
+    return (PyObject *)v;
+
+    onError:
+    Py_XDECREF(errorHandler);
+    Py_XDECREF(exc);
+    Py_XDECREF(v);
+    return NULL;
+}
+
 /* --- Decimal Encoder ---------------------------------------------------- */
 
 int PyUnicode_EncodeDecimal(Py_UNICODE *s,
Index: Include/unicodeobject.h
===================================================================
RCS file: /cvsroot/python/python/dist/src/Include/unicodeobject.h,v
retrieving revision 2.49
diff -u -r2.49 unicodeobject.h
--- Include/unicodeobject.h	30 Aug 2005 10:23:13 -0000	2.49
+++ Include/unicodeobject.h	5 Oct 2005 14:39:34 -0000
@@ -157,6 +157,7 @@
 # define PyUnicode_Decode PyUnicodeUCS2_Decode
 # define PyUnicode_DecodeASCII PyUnicodeUCS2_DecodeASCII
 # define PyUnicode_DecodeCharmap PyUnicodeUCS2_DecodeCharmap
+# define PyUnicode_DecodeFastmap PyUnicodeUCS2_DecodeFastmap
 # define PyUnicode_DecodeLatin1 PyUnicodeUCS2_DecodeLatin1
 # define PyUnicode_DecodeRawUnicodeEscape PyUnicodeUCS2_DecodeRawUnicodeEscape
 # define PyUnicode_DecodeUTF16 PyUnicodeUCS2_DecodeUTF16
@@ -168,6 +169,7 @@
 # define PyUnicode_EncodeASCII PyUnicodeUCS2_EncodeASCII
 # define PyUnicode_EncodeCharmap PyUnicodeUCS2_EncodeCharmap
 # define PyUnicode_EncodeDecimal PyUnicodeUCS2_EncodeDecimal
+# define PyUnicode_EncodeFastmap PyUnicodeUCS2_EncodeFastmap
 # define PyUnicode_EncodeLatin1 PyUnicodeUCS2_EncodeLatin1
 # define PyUnicode_EncodeRawUnicodeEscape PyUnicodeUCS2_EncodeRawUnicodeEscape
 # define PyUnicode_EncodeUTF16 PyUnicodeUCS2_EncodeUTF16
@@ -232,6 +234,7 @@
 # define PyUnicode_Decode PyUnicodeUCS4_Decode
 # define PyUnicode_DecodeASCII PyUnicodeUCS4_DecodeASCII
 # define PyUnicode_DecodeCharmap PyUnicodeUCS4_DecodeCharmap
+# define PyUnicode_DecodeFastmap PyUnicodeUCS4_DecodeFastmap
 # define PyUnicode_DecodeLatin1 PyUnicodeUCS4_DecodeLatin1
 # define PyUnicode_DecodeRawUnicodeEscape PyUnicodeUCS4_DecodeRawUnicodeEscape
 # define PyUnicode_DecodeUTF16 PyUnicodeUCS4_DecodeUTF16
@@ -243,6 +246,7 @@
 # define PyUnicode_EncodeASCII PyUnicodeUCS4_EncodeASCII
 # define PyUnicode_EncodeCharmap PyUnicodeUCS4_EncodeCharmap
 # define PyUnicode_EncodeDecimal PyUnicodeUCS4_EncodeDecimal
+# define PyUnicode_EncodeFastmap PyUnicodeUCS4_EncodeFastmap
 # define PyUnicode_EncodeLatin1 PyUnicodeUCS4_EncodeLatin1
 # define PyUnicode_EncodeRawUnicodeEscape PyUnicodeUCS4_EncodeRawUnicodeEscape
 # define PyUnicode_EncodeUTF16 PyUnicodeUCS4_EncodeUTF16
@@ -915,6 +919,43 @@
     const char *errors          /* error handling */
     );
 
+/* --- Fast Map Codecs ----------------------------------------------------
+
+   Table-driven 8-bit codecs.
+
+   Decoding uses a flat table of 256 Py_UNICODE values indexed by byte
+   value; Py_UNICODE_REPLACEMENT_CHARACTER marks an undefined byte.
+
+   Encoding uses an index of PyFastmap_EncodingIndex segments, one per
+   high byte (ch >> 8) with at least one mapping, plus one byte table
+   that all segments share.  index[0] must describe high byte 0.
+
+*/
+
+PyAPI_FUNC(PyObject*) PyUnicode_DecodeFastmap(
+    const char *string,         /* Encoded string */
+    int length,                 /* size of string */
+    const Py_UNICODE *table,    /* Translate map */
+    const char *errors          /* error handling */
+    );
+
+typedef struct {
+    unsigned char high;         /* high byte this segment covers */
+    unsigned char lowfirst, lowlast; /* covered low-byte range, inclusive;
+                                   0xff/0x00 marks a full 256-entry map */
+    unsigned char undefmark;    /* table value meaning "undefined" */
+    unsigned short mapindex;    /* offset of the segment in the table */
+} PyFastmap_EncodingIndex;
+
+PyAPI_FUNC(PyObject*) PyUnicode_EncodeFastmap(
+    const Py_UNICODE *data,     /* Unicode char buffer */
+    int length,                 /* Number of Py_UNICODE chars to encode */
+    const unsigned char *table, /* Translate map */
+    const PyFastmap_EncodingIndex *index, /* encoding map index */
+    int indexsize,              /* the index's size */
+    const char *errors          /* error handling */
+    );
+
 #ifdef MS_WIN32
 
 /* --- MBCS codecs for Windows -------------------------------------------- */
Index: Modules/_codecsmodule.c
===================================================================
RCS file: /cvsroot/python/python/dist/src/Modules/_codecsmodule.c,v
retrieving revision 2.22
diff -u -r2.22 _codecsmodule.c
--- Modules/_codecsmodule.c	30 Aug 2005 10:23:14 -0000	2.22
+++ Modules/_codecsmodule.c	5 Oct 2005 14:39:34 -0000
@@ -832,6 +832,245 @@
 }
 
 #endif /* MS_WINDOWS */
+
+
+/* --- Fastmap Object ----------------------------------------------------- */
+
+static char *codeckwarglist[] = {"input", "errors", NULL};
+
+/* A codec whose translation tables are owned by the object itself.  All
+   three buffers are allocated with PyMem_New() in fastmap_codec() and
+   released in fastmap_dealloc(). */
+typedef struct {
+    PyObject_HEAD
+    Py_UNICODE *decoding_map;           /* 256 entries, indexed by byte */
+    unsigned char *encoding_map;        /* shared table of all segments */
+    PyFastmap_EncodingIndex *encoding_index;
+    int encoding_index_size;            /* number of index entries */
+} FastmapCodecObject;
+
+#if SIZEOF_SHORT == 2
+typedef unsigned short fastmap_ucs2_t;
+#else
+#error fastmap is not support on this platform yet.
+#endif
+
+/* decode(input[, errors]): decode a byte buffer with the object's
+   decoding_map; the result and the input size go through codec_tuple(). */
+static PyObject *
+fastmap_decode(FastmapCodecObject *self,
+               PyObject *args, PyObject *kwds)
+{
+    const char *errors = NULL;
+    const char *data;
+    int size;
+
+    if (!PyArg_ParseTupleAndKeywords(args, kwds, "t#|z:fastmap_decode",
+                                     codeckwarglist, &data, &size, &errors))
+        return NULL;
+
+    return codec_tuple(PyUnicode_DecodeFastmap(data, size, self->decoding_map,
+                                               errors),
+                       size);
+}
+
+/* encode(input[, errors]): coerce input with PyUnicode_FromObject() and
+   encode it with the object's encoding_map and encoding_index. */
+static PyObject *
+fastmap_encode(FastmapCodecObject *self,
+               PyObject *args, PyObject *kwds)
+{
+    PyObject *str, *v;
+    const char *errors = NULL;
+
+    if (!PyArg_ParseTupleAndKeywords(args, kwds, "O|z:fastmap_encode",
+                                     codeckwarglist, &str, &errors))
+        return NULL;
+
+    str = PyUnicode_FromObject(str);
+    if (str == NULL)
+        return NULL;
+    v = codec_tuple(PyUnicode_EncodeFastmap(
+                        PyUnicode_AS_UNICODE(str),
+                        PyUnicode_GET_SIZE(str),
+                        self->encoding_map,
+                        self->encoding_index,
+                        self->encoding_index_size,
+                        errors),
+                    PyUnicode_GET_SIZE(str));
+    Py_DECREF(str);
+    return v;
+}
+
+static struct PyMethodDef fastmap_methods[] = {
+    {"encode", (PyCFunction)fastmap_encode, METH_VARARGS | METH_KEYWORDS},
+    {"decode", (PyCFunction)fastmap_decode, METH_VARARGS | METH_KEYWORDS},
+    {NULL, NULL},
+};
+
+/* Release the three tables allocated by fastmap_codec(), then the object. */
+static void
+fastmap_dealloc(FastmapCodecObject *self)
+{
+    PyMem_Del(self->decoding_map);
+    PyMem_Del(self->encoding_map);
+    PyMem_Del(self->encoding_index);
+    PyObject_Del(self);
+}
+
+static PyTypeObject FastmapCodec_Type = {
+    PyObject_HEAD_INIT(NULL)
+    0,                                  /* ob_size */
+    "FastmapCodec",                     /* tp_name */
+    sizeof(FastmapCodecObject),         /* tp_basicsize */
+    0,                                  /* tp_itemsize */
+    /* methods */
+    (destructor)fastmap_dealloc,        /* tp_dealloc */
+    0,                                  /* tp_print */
+    0,                                  /* tp_getattr */
+    0,                                  /* tp_setattr */
+    0,                                  /* tp_compare */
+    0,                                  /* tp_repr */
+    0,                                  /* tp_as_number */
+    0,                                  /* tp_as_sequence */
+    0,                                  /* tp_as_mapping */
+    0,                                  /* tp_hash */
+    0,                                  /* tp_call */
+    0,                                  /* tp_str */
+    PyObject_GenericGetAttr,            /* tp_getattro */
+    0,                                  /* tp_setattro */
+    0,                                  /* tp_as_buffer */
+    Py_TPFLAGS_DEFAULT,                 /* tp_flags */
+    0,                                  /* tp_doc */
+    0,                                  /* tp_traverse */
+    0,                                  /* tp_clear */
+    0,                                  /* tp_richcompare */
+    0,                                  /* tp_weaklistoffset */
+    0,                                  /* tp_iter */
+    0,                                  /* tp_iternext */
+    fastmap_methods,                    /* tp_methods */
+};
+
+#define EIDX_ELEM_SIZE (1+1+1+1+2) /* !BBBBH */
+
+/* fastmap_codec(decoding_map, encoding_map, encoding_index) -> codec object
+
+   decoding_map is 256 big-endian 16-bit code points; encoding_index is a
+   sequence of EIDX_ELEM_SIZE-byte records (struct format "!BBBBH": high,
+   lowfirst, lowlast, undefmark, mapindex) whose first record must describe
+   high byte 0; encoding_map is the byte table the records point into.
+   Every record is validated against the size of encoding_map so that the
+   encoder can never read outside the table. */
+static PyObject *
+fastmap_codec(PyObject *ignore, PyObject *args)
+{
+    FastmapCodecObject *self;
+    Py_UNICODE *decoding_map = NULL;
+    unsigned char *encoding_map = NULL;
+    PyFastmap_EncodingIndex *encoding_index = NULL;
+    int encoding_index_size;
+    unsigned char *dmap, *emap, *eidx;
+    int dmapsize, emapsize, eidxsize;
+    int i;
+
+    if (!PyArg_ParseTuple(args, "t#t#t#:fastmap_codec", &dmap, &dmapsize,
+                          &emap, &emapsize, &eidx, &eidxsize))
+        return NULL;
+
+    if (dmapsize != sizeof(fastmap_ucs2_t) * 256) {
+        PyErr_Format(PyExc_ValueError, "fastmap requires a decoding "
+                     "translation table of %d bytes",
+                     (int)(sizeof(fastmap_ucs2_t) * 256));
+        return NULL;
+    }
+
+    if (eidxsize % EIDX_ELEM_SIZE) {
+        PyErr_Format(PyExc_ValueError,
+                     "encoding index has some trailing data");
+        return NULL;
+    }
+
+    encoding_index_size = eidxsize / EIDX_ELEM_SIZE;
+
+    /* The encoder's latin-1 fast path uses index[0] without searching. */
+    if (encoding_index_size < 1 || eidx[0] != 0) {
+        PyErr_SetString(PyExc_ValueError,
+                        "encoding index must start with an entry for "
+                        "high byte 0");
+        return NULL;
+    }
+
+    encoding_index = PyMem_New(PyFastmap_EncodingIndex, encoding_index_size);
+    if (encoding_index == NULL) {
+        PyErr_NoMemory();
+        return NULL;
+    }
+
+    for (i = 0; i < encoding_index_size; i++) {
+        const unsigned char *eidxelem;
+        int mapindex, lastoffset;
+
+        eidxelem = (unsigned char *)eidx + i*EIDX_ELEM_SIZE;
+        mapindex = (eidxelem[4]<<8) | eidxelem[5];
+        encoding_index[i].high = eidxelem[0];
+        encoding_index[i].lowfirst = eidxelem[1];
+        encoding_index[i].lowlast = eidxelem[2];
+        encoding_index[i].undefmark = eidxelem[3];
+        encoding_index[i].mapindex = (unsigned short)mapindex;
+
+        /* check whether the index may cause map overflows */
+        if (eidxelem[1] == 0xff && eidxelem[2] == 0x00)
+            lastoffset = 0xff;      /* full map: all 256 low bytes */
+        else if (eidxelem[1] > eidxelem[2])
+            continue;               /* empty range: table is never read */
+        else
+            lastoffset = eidxelem[2] - eidxelem[1];
+        if (mapindex + lastoffset >= emapsize) {
+            PyErr_Format(PyExc_ValueError,
+                         "encoding index may cause overflow.");
+            goto onError;
+        }
+    }
+
+    encoding_map = PyMem_New(unsigned char, emapsize);
+    if (encoding_map == NULL) {
+        PyErr_NoMemory();
+        goto onError;
+    }
+
+    memcpy(encoding_map, emap, emapsize);
+
+    decoding_map = PyMem_New(Py_UNICODE, 256);
+    if (decoding_map == NULL) {
+        PyErr_NoMemory();
+        goto onError;
+    }
+
+    for (i = 0; i < 256; i++)
+        decoding_map[i] = (Py_UNICODE)((dmap[i*2]<<8) | dmap[i*2 + 1]);
+
+    self = PyObject_New(FastmapCodecObject, &FastmapCodec_Type);
+    if (self == NULL)
+        goto onError;
+
+    self->decoding_map = decoding_map;
+    self->encoding_map = encoding_map;
+    self->encoding_index = encoding_index;
+    self->encoding_index_size = encoding_index_size;
+
+    return (PyObject *)self;
+
+    onError:
+    if (encoding_index != NULL)
+        PyMem_Del(encoding_index);
+    if (encoding_map != NULL)
+        PyMem_Del(encoding_map);
+    if (decoding_map != NULL)
+        PyMem_Del(decoding_map);
+    return NULL;
+}
+#undef EIDX_ELEM_SIZE
+
 #endif /* Py_USING_UNICODE */
 
 /* --- Error handler registry --------------------------------------------- */
@@ -918,6 +1157,7 @@
     {"mbcs_encode", mbcs_encode, METH_VARARGS},
     {"mbcs_decode", mbcs_decode, METH_VARARGS},
 #endif
+    {"fastmap_codec", fastmap_codec, METH_VARARGS},
 #endif /* Py_USING_UNICODE */
     {"register_error", register_error, METH_VARARGS,
         register_error__doc__},
@@ -930,4 +1170,10 @@
 init_codecs(void)
 {
+#ifdef Py_USING_UNICODE
+    /* The type must be ready before fastmap_codec() becomes reachable. */
+    if (PyType_Ready(&FastmapCodec_Type) < 0)
+        return;
+#endif
+
     Py_InitModule("_codecs", _codecs_functions);
 }
Index: Tools/scripts/genfastcodec.py
===================================================================
RCS file: Tools/scripts/genfastcodec.py
diff -N Tools/scripts/genfastcodec.py
--- /dev/null	1 Jan 1970 00:00:00 -0000
+++ Tools/scripts/genfastcodec.py	5 Oct 2005 14:39:34 -0000
@@ -0,0 +1,189 @@
+"""Generate a fastmap codec module from an existing charmap codec.
+
+Usage: python genfastcodec.py <encoding>  > Lib/encodings/<encoding>.py
+
+The generated module packs the decoding table, the encoding index and the
+encoding table into byte strings and hands them to codecs.fastmap_codec().
+"""
+
+import struct
+import string
+
+UNICODE_REPLACEMENT = 0xFFFD
+CODEC_TEMPLATE = string.Template("""\
+\"\"\" Python Fast Mapping Codec generated from XXX with genfastcodec.py.
+
+Written by Marc-Andre Lemburg (mal@lemburg.com).
+
+(c) Copyright CNRI, All Rights Reserved. NO WARRANTY.
+(c) Copyright 2000 Guido van Rossum.
+\"\"\"
+
+import codecs
+
+### Decoding Map
+
+decoding_map = (
+$decoding_map)
+
+### Encoding Index
+
+encoding_index = (
+$encoding_index)
+
+### Encoding Map
+
+encoding_map = (
+$encoding_map)
+
+### Codec APIs
+
+_codec = codecs.fastmap_codec(decoding_map, encoding_map, encoding_index)
+
+class Codec(codecs.Codec):
+
+    encode = _codec.encode
+    decode = _codec.decode
+
+class StreamWriter(Codec,codecs.StreamWriter):
+    pass
+
+class StreamReader(Codec,codecs.StreamReader):
+    pass
+
+### encodings module API
+
+def getregentry():
+
+    return (Codec().encode,Codec().decode,StreamReader,StreamWriter)
+""")#"
+
+def generate_decoding_trans(decmap):
+    """Pack decmap into 256 big-endian 16-bit code points.
+
+    Bytes that are missing from decmap, or explicitly mapped to None, are
+    emitted as UNICODE_REPLACEMENT, which the decoder treats as undefined.
+    """
+    out = []
+    for i in range(256):
+        uni = decmap.get(i)
+        if uni is None:
+            # charmap codecs spell "undefined" as an explicit None entry
+            uni = UNICODE_REPLACEMENT
+        out.append(struct.pack('!H', uni))
+    return ''.join(out)
+
+def generate_encoding_trans(encmap):
+    """Build the (index, table) byte strings for the fastmap encoder.
+
+    encmap maps code points to byte values (or None).  One index record
+    ('!BBBBH': high, lowfirst, lowlast, undefined mark, table offset) is
+    emitted per high byte that has a mapping; the table holds the byte
+    values for lowfirst..lowlast of every record, back to back.
+    """
+    emapdigest = {}
+
+    for i in range(256):
+        piecemap = {}
+        for j in range(256):
+            encoded = encmap.get((i<<8) | j)
+            if encoded is not None:
+                piecemap[j] = encoded
+
+        if not piecemap:
+            continue
+
+        # begin creating tight mapping
+        d = emapdigest[i] = {'high': i}
+        lowbytes = piecemap.keys()
+        d['lowfirst'] = min(lowbytes)
+        d['lowlast'] = max(lowbytes)
+
+        used = set()
+        mapping = []
+
+        for low in range(d['lowfirst'], d['lowlast']+1):
+            encoded = piecemap.get(low, None)
+            mapping.append(encoded)
+            if encoded is not None:
+                used.add(encoded)
+
+        for unusedchk in range(256):
+            if unusedchk not in used:
+                break
+        else:
+            if None in mapping:
+                raise ValueError, "can't get tight in this mapping"
+            unusedchk = None
+
+        if unusedchk is None:
+            if d['lowfirst'] != 0 or d['lowlast'] != 255:
+                raise ValueError, "there's no hole and not full mapping"
+            d['unused'] = None
+        else:
+            d['unused'] = unusedchk
+            for k, value in enumerate(mapping):
+                if value is None:
+                    mapping[k] = unusedchk
+
+        d['mapping'] = mapping
+
+    if 0 not in emapdigest:
+        # an entry for high byte 0 must always exist: the encoder uses
+        # index[0] as its latin-1 fast path.
+        emapdigest[0] = {
+            'high': 0,
+            'lowfirst': 1, 'lowlast': 0, # no mapping
+            'mapping': [], 'unused': 0,
+        }
+
+    indexout = []
+    wholemapping = []
+
+    # generate mapping part & mark index positions
+    for high in sorted(emapdigest.keys()):
+        emapdigest[high]['mapbegins'] = len(wholemapping)
+        wholemapping.extend(map(chr, emapdigest[high]['mapping']))
+
+    for high in sorted(emapdigest.keys()):
+        curpart = emapdigest[high]
+        if curpart['unused'] is None:
+            lowfirst = 0xff # mark as special full map
+            lowlast = 0x00
+            unused = 0x00
+        else:
+            lowfirst, lowlast = curpart['lowfirst'], curpart['lowlast']
+            unused = curpart['unused']
+
+        indexout.append(struct.pack('!BBBBH',
+            curpart['high'], lowfirst, lowlast, unused, curpart['mapbegins']))
+
+    indexpart = ''.join(indexout)
+    mappart = ''.join(wholemapping)
+
+    return indexpart, mappart
+
+def hexdumpstr(s, perline=16):
+    """Render s as quoted hex-escaped string literals, perline bytes each."""
+    o = []
+    for cur in range(0, len(s), perline):
+        v = ''.join('\\x%02x' % ord(c) for c in s[cur:cur+perline])
+        o.append('"%s"\n' % v)
+    return ''.join(o)
+
+def gencodec(encoding):
+    """Return the source of a fastmap codec module for encodings.<encoding>."""
+    mod = __import__('encodings.' + encoding)
+    mod = getattr(mod, encoding)
+    decmap = generate_decoding_trans(mod.decoding_map)
+    encidx, encmap = generate_encoding_trans(mod.encoding_map)
+    return CODEC_TEMPLATE.substitute(
+                encoding_map=hexdumpstr(encmap),
+                encoding_index=hexdumpstr(encidx),
+                decoding_map=hexdumpstr(decmap))
+
+if __name__ == '__main__':
+    # just temporary usage :-)
+    # python genfastcodec.py iso8859_1 > Lib/encodings/iso8859_1.py
+    import sys
+    sys.stdout.write(gencodec(sys.argv[1]))