cpython: dd67c8c53aea (original) (raw)
Mercurial > cpython
changeset 99484:dd67c8c53aea
Issue #25638: Optimized ElementTree.iterparse(); it is now 2x faster. ElementTree.XMLParser._setevents now accepts any objects with the append method, not just a list. [#25638]
author | Serhiy Storchaka <storchaka@gmail.com> |
---|---|
date | Mon, 07 Dec 2015 02:31:11 +0200 |
parents | 2cf16918b632 |
children | b4aeb35ab7e1 |
files | Lib/xml/etree/ElementTree.py Misc/NEWS Modules/_elementtree.c Modules/clinic/_elementtree.c.h |
diffstat | 4 files changed, 56 insertions(+), 80 deletions(-): Lib/xml/etree/ElementTree.py 92, Misc/NEWS 2, Modules/_elementtree.c 35, Modules/clinic/_elementtree.c.h 7 |
line wrap: on
line diff
--- a/Lib/xml/etree/ElementTree.py +++ b/Lib/xml/etree/ElementTree.py @@ -95,6 +95,7 @@ import sys import re import warnings import io +import collections import contextlib from . import ElementPath @@ -1198,16 +1199,37 @@ def iterparse(source, events=None, parse Returns an iterator providing (event, elem) pairs. """
# Use the internal, undocumented _parser argument for now; When the
# parser argument of iterparse is removed, this can be killed.
- pullparser = XMLPullParser(events=events, _parser=parser)
- def iterator():
try:[](#l1.19)
while True:[](#l1.20)
yield from pullparser.read_events()[](#l1.21)
# load event buffer[](#l1.22)
data = source.read(16 * 1024)[](#l1.23)
if not data:[](#l1.24)
break[](#l1.25)
pullparser.feed(data)[](#l1.26)
root = pullparser._close_and_return_root()[](#l1.27)
yield from pullparser.read_events()[](#l1.28)
it.root = root[](#l1.29)
finally:[](#l1.30)
if close_source:[](#l1.31)
source.close()[](#l1.32)
- class IterParseIterator(collections.Iterator):
__next__ = iterator().__next__[](#l1.35)
- it = IterParseIterator()
- it.root = None
- del iterator, IterParseIterator
+ close_source = False if not hasattr(source, "read"): source = open(source, "rb") close_source = True
- try:
return _IterParseIterator(source, events, parser, close_source)[](#l1.45)
- except:
if close_source:[](#l1.47)
source.close()[](#l1.48)
raise[](#l1.49)
class XMLPullParser: @@ -1217,9 +1239,7 @@ class XMLPullParser: # upon in user code. It will be removed in a future release. # See http://bugs.python.org/issue17741 for more details.
# _elementtree.c expects a list, not a deque[](#l1.59)
self._events_queue = [][](#l1.60)
self._index = 0[](#l1.61)
self._events_queue = collections.deque()[](#l1.62) self._parser = _parser or XMLParser(target=TreeBuilder())[](#l1.63) # wire up the parser for event reporting[](#l1.64) if events is None:[](#l1.65)
@@ -1257,64 +1277,14 @@ class XMLPullParser: retrieved from the iterator. """ events = self._events_queue
while True:[](#l1.70)
index = self._index[](#l1.71)
try:[](#l1.72)
event = events[self._index][](#l1.73)
# Avoid retaining references to past events[](#l1.74)
events[self._index] = None[](#l1.75)
except IndexError:[](#l1.76)
break[](#l1.77)
index += 1[](#l1.78)
# Compact the list in a O(1) amortized fashion[](#l1.79)
# As noted above, _elementree.c needs a list, not a deque[](#l1.80)
if index * 2 >= len(events):[](#l1.81)
events[:index] = [][](#l1.82)
self._index = 0[](#l1.83)
else:[](#l1.84)
self._index = index[](#l1.85)
while events:[](#l1.86)
event = events.popleft()[](#l1.87) if isinstance(event, Exception):[](#l1.88) raise event[](#l1.89) else:[](#l1.90) yield event[](#l1.91)
- def __init__(self, source, events, parser, close_source=False):
# Use the internal, undocumented _parser argument for now; When the[](#l1.97)
# parser argument of iterparse is removed, this can be killed.[](#l1.98)
self._parser = XMLPullParser(events=events, _parser=parser)[](#l1.99)
self._file = source[](#l1.100)
self._close_file = close_source[](#l1.101)
self.root = self._root = None[](#l1.102)
- def __next__(self):
try:[](#l1.105)
while 1:[](#l1.106)
for event in self._parser.read_events():[](#l1.107)
return event[](#l1.108)
if self._parser._parser is None:[](#l1.109)
break[](#l1.110)
# load event buffer[](#l1.111)
data = self._file.read(16 * 1024)[](#l1.112)
if data:[](#l1.113)
self._parser.feed(data)[](#l1.114)
else:[](#l1.115)
self._root = self._parser._close_and_return_root()[](#l1.116)
self.root = self._root[](#l1.117)
except:[](#l1.118)
if self._close_file:[](#l1.119)
self._file.close()[](#l1.120)
raise[](#l1.121)
if self._close_file:[](#l1.122)
self._file.close()[](#l1.123)
raise StopIteration[](#l1.124)
- - def XML(text, parser=None): """Parse XML document from string constant.
--- a/Misc/NEWS +++ b/Misc/NEWS @@ -109,6 +109,8 @@ Core and Builtins Library ------- +- Issue #25638: Optimized ElementTree.iterparse(); it is now 2x faster. +
- Issue #25761: Improved detecting errors in broken pickle data.
- Issue #25717: Restore the previous behaviour of tolerating most fstat()
--- a/Modules/_elementtree.c +++ b/Modules/_elementtree.c @@ -2289,7 +2289,7 @@ typedef struct { PyObject *element_factory; /* element tracing */
- PyObject *events_append; /* the append method of the list of events, or NULL */ PyObject *start_event_obj; /* event objects (NULL to ignore) */ PyObject *end_event_obj; PyObject *start_ns_event_obj; @@ -2324,7 +2324,7 @@ treebuilder_new(PyTypeObject *type, PyOb } t->index = 0;
t->events = NULL;[](#l3.16)
} @@ -2374,7 +2374,7 @@ treebuilder_gc_clear(TreeBuilderObject * Py_CLEAR(self->start_ns_event_obj); Py_CLEAR(self->end_event_obj); Py_CLEAR(self->start_event_obj);t->events_append = NULL;[](#l3.17) t->start_event_obj = t->end_event_obj = NULL;[](#l3.18) t->start_ns_event_obj = t->end_ns_event_obj = NULL;[](#l3.19)
- Py_CLEAR(self->events_append); Py_CLEAR(self->stack); Py_CLEAR(self->data); Py_CLEAR(self->last); @@ -2455,13 +2455,14 @@ treebuilder_append_event(TreeBuilderObje PyObject *node)
PyObject *res = PyTuple_Pack(2, action, node);[](#l3.34)
PyObject *res;[](#l3.35)
PyObject *event = PyTuple_Pack(2, action, node);[](#l3.36)
if (event == NULL)[](#l3.37)
return -1;[](#l3.38)
res = PyObject_CallFunctionObjArgs(self->events_append, event, NULL);[](#l3.39)
Py_DECREF(event);[](#l3.40) if (res == NULL)[](#l3.41) return -1;[](#l3.42)
if (PyList_Append(self->events, res) < 0) {[](#l3.43)
Py_DECREF(res);[](#l3.44)
return -1;[](#l3.45)
} return 0; @@ -3039,7 +3040,7 @@ expat_start_ns_handler(XMLParserObject* if (PyErr_Occurred()) return;}[](#l3.46) Py_DECREF(res);[](#l3.47)
if (!uri) @@ -3062,7 +3063,7 @@ expat_end_ns_handler(XMLParserObject* se if (PyErr_Occurred()) return;
treebuilder_append_event(target, target->end_ns_event_obj, Py_None); @@ -3551,7 +3552,7 @@ static PyObject * /*[clinic input] _elementtree.XMLParser._setevents
- events_queue: object events_to_report: object = None / @@ -3561,12 +3562,12 @@ static PyObject * _elementtree_XMLParser__setevents_impl(XMLParserObject *self, PyObject *events_queue, PyObject *events_to_report)
-/*[clinic end generated code: output=1440092922b13ed1 input=59db9742910c6174]*/ +/*[clinic end generated code: output=1440092922b13ed1 input=abf90830a1c3b0fc]*/ { /* activate element event reporting */ Py_ssize_t i, seqlen; TreeBuilderObject *target;
if (!TreeBuilder_CheckExact(self->target)) { PyErr_SetString( @@ -3579,9 +3580,11 @@ static PyObject * target = (TreeBuilderObject*) self->target;
- events_append = PyObject_GetAttrString(events_queue, "append");
- if (events_append == NULL)
return NULL;[](#l3.101)
- Py_XDECREF(target->events_append);
- target->events_append = events_append;
/* clear out existing events */ Py_CLEAR(target->start_event_obj);
--- a/Modules/clinic/_elementtree.c.h +++ b/Modules/clinic/_elementtree.c.h @@ -668,12 +668,13 @@ static PyObject * PyObject *events_queue; PyObject *events_to_report = Py_None;
- if (!PyArg_ParseTuple(args, "O!|O:_setevents",
&PyList_Type, &events_queue, &events_to_report))[](#l4.8)
- if (!PyArg_UnpackTuple(args, "_setevents",
1, 2,[](#l4.10)
return_value = _elementtree_XMLParser__setevents_impl(self, events_queue, events_to_report); exit: return return_value; } -/*[clinic end generated code: output=25b8bf7e7f2151ca input=a9049054013a1b77]*/ +/*[clinic end generated code: output=19d94e2d2726d3aa input=a9049054013a1b77]*/ &events_queue, &events_to_report))[](#l4.11) goto exit;[](#l4.12)