bpo-31170: Write unit test for Expat 2.2.4 UTF-8 bug (#3570) (#3745) · python/cpython@5f5da72 (original) (raw)

2 files changed

lines changed

Original file line number Diff line number Diff line change
@@ -30,6 +30,7 @@
30 30
# Sample XML files, each looked up with findfile() under the "xmltestdata"
# subdirectory.
31 31 SIMPLE_XMLFILE = findfile("simple.xml", subdir="xmltestdata")
32 32 SIMPLE_NS_XMLFILE = findfile("simple-ns.xml", subdir="xmltestdata")
# Opened in binary mode by test_expat224_utf8_bug_file.
33 +UTF8_BUG_XMLFILE = findfile("expat224_utf8_bug.xml", subdir="xmltestdata")
33 34
34 35 SAMPLE_XML = """\
35 36
@@ -1494,6 +1495,36 @@ def test_issue10777(self):
1494 1495 ET.register_namespace('test10777', 'http://myuri/')
1495 1496 ET.register_namespace('test10777', 'http://myuri/')
1496 1497
def check_expat224_utf8_bug(self, text):
    """Check that *text* survives a round trip through the XML parser.

    *text* is a UTF-8 encoded bytes object.  It is embedded as the value
    of the 'b' attribute of an <a> element, the document is parsed with
    ET.XML(), and the parsed attribute is compared with
    text.decode('utf-8').

    *text* is inserted verbatim, so it must not contain '"', '<' or '&'.
    """
    # The template needs exactly one %s placeholder: with an empty
    # template, b'' % text raises TypeError before the parser ever runs.
    xml = b'<a b="%s"/>' % text
    root = ET.XML(xml)
    self.assertEqual(root.get('b'), text.decode('utf-8'))
1502 +
def test_expat224_utf8_bug(self):
    # bpo-31170: Expat 2.2.3 had a bug in its UTF-8 decoder.
    # Check that Expat 2.2.4 fixed the bug.
    #
    # b'\xc3\xa0' is one two-byte UTF-8 sequence.  Running the check once
    # without and once with a one-byte ASCII prefix shifts every sequence
    # by one byte, so buffer bounds are tested at odd and even positions.
    two_byte_run = b'\xc3\xa0' * 1024
    for prefix in (b'', b'x'):
        self.check_expat224_utf8_bug(prefix + two_byte_run)

1514 +
def test_expat224_utf8_bug_file(self):
    # Parse the sample file UTF8_BUG_XMLFILE and check that the 'b'
    # attribute reported by the parser matches the text extracted by hand
    # from the raw bytes of the file.
    with open(UTF8_BUG_XMLFILE, 'rb') as fp:
        raw = fp.read()
    root = ET.fromstring(raw)
    xmlattr = root.get('b')

    # "Parse" manually the XML file to extract the value of the 'b'
    # attribute of the <a b='xxx' /> XML element
    text = raw.decode('utf-8').strip()
    # An XML processor reports every line break inside an attribute value
    # as a single space, whatever the line-ending convention of the file.
    # Replace CRLF first so that it yields one space, not two.
    text = text.replace('\r\n', ' ').replace('\r', ' ').replace('\n', ' ')
    # Drop the leading "<a b='" (6 characters) and the trailing "' />"
    # (4 characters).
    text = text[6:-4]
    self.assertEqual(xmlattr, text)
1527 +
1497 1528
1498 1529 # --------------------------------------------------------------------
1499 1530
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
1 +<a b='01234567890123456古人咏雪抽幽思骋妍辞竞险韵偶得一编奇绝辄擅美当时流声后代是以北门之风南山之雅梁园之简黄台之赋至今为作家称述尚矣及至洛阳之卧剡溪之兴灞桥之思亦皆传为故事钱塘沈履德先生隐居西湖两峰间孤高贞洁与雪同调方大雪满天皴肤粟背之际先生乃鹿中豹舄端居闭门或扶童曳杖踏遍六桥三竺时取古人诗讽咏之合唐宋元诸名家集句成诗得二百四十章联络通穿如出一人如呵一气气立于言表格备于篇中略无掇拾补凑之形非胸次包罗壮阔笔底驱走鲍谢欧苏诸公不能为此世称王荆公为集句擅长观其在钟山对雪仅题数篇未见有此噫嘻奇矣哉亦富矣哉予慕先生有袁安之节愧不能为慧可之立乃取新集命工传写使海内同好者知先生为博古传述之士而一新世人之耳目他日必有慕潜德阐幽光而剞劂以传者余实为之执殳矣
2 +弘治戊午仲冬望日慈溪杨子器衵于海虞官舍序毕诗部' />