bpo-29456: Fix bugs in unicodedata.normalize: u1176, u11a7 and u11c3 … · python/cpython@e2e7ff0 (original) (raw)

4 files changed

lines changed

Original file line number Diff line number Diff line change
@@ -208,6 +208,19 @@ def test_issue10254(self):
208 208 b = 'C\u0338' * 20 + '\xC7'
209 209 self.assertEqual(self.db.normalize('NFC', a), b)
210 210
211 +def test_issue29456(self):
212 +# Regression test for bpo-29456: NFC must not compose Hangul jamo lying just outside the modern vowel/trailing-consonant ranges.
213 +u1176_str_a = '\u1100\u1176\u11a8'  # input: the vowel U+1176 is one past the last modern vowel (U+1175)
214 +u1176_str_b = '\u1100\u1176\u11a8'  # expected: identical to the input, i.e. nothing is composed
215 +u11a7_str_a = '\u1100\u1175\u11a7'  # input: U+11A7 is one below the first modern trailing consonant (U+11A8)
216 +u11a7_str_b = '\uae30\u11a7'  # expected: L+V compose to U+AE30, U+11A7 is kept as a separate character
217 +u11c3_str_a = '\u1100\u1175\u11c3'  # input: U+11C3 is one past the last modern trailing consonant (U+11C2)
218 +u11c3_str_b = '\uae30\u11c3'  # expected: L+V compose to U+AE30, U+11C3 is kept as a separate character
219 +self.assertEqual(self.db.normalize('NFC', u1176_str_a), u1176_str_b)
220 +self.assertEqual(self.db.normalize('NFC', u11a7_str_a), u11a7_str_b)
221 +self.assertEqual(self.db.normalize('NFC', u11c3_str_a), u11c3_str_b)
222 +
223 +
211 224 def test_east_asian_width(self):
212 225 eaw = self.db.east_asian_width
213 226 self.assertRaises(TypeError, eaw, b'a')
Original file line number Diff line number Diff line change
@@ -1742,6 +1742,7 @@ Jason Yeo
1742 1742 EungJun Yi
1743 1743 Bob Yodlowski
1744 1744 Danny Yoo
1745 +Wonsup Yoon
1745 1746 Rory Yorke
1746 1747 George Yoshida
1747 1748 Kazuhiro Yoshida
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
1 +Fix bugs in Hangul normalization in unicodedata.normalize(): U+1176, U+11A7 and U+11C3 were wrongly treated as composable jamo.
Original file line number Diff line number Diff line change
@@ -681,15 +681,19 @@ nfc_nfkc(PyObject *self, PyObject *input, int k)
681 681 if (LBase <= code && code < (LBase+LCount) &&
682 682 i + 1 < len &&
683 683 VBase <= PyUnicode_READ(kind, data, i+1) &&
684 -PyUnicode_READ(kind, data, i+1) <= (VBase+VCount)) {
684 +PyUnicode_READ(kind, data, i+1) < (VBase+VCount)) {
685 +/* check L character is a modern leading consonant (0x1100 ~ 0x1112)
686 + and V character is a modern vowel (0x1161 ~ 0x1175). */
685 687 int LIndex, VIndex;
686 688 LIndex = code - LBase;
687 689 VIndex = PyUnicode_READ(kind, data, i+1) - VBase;
688 690 code = SBase + (LIndex*VCount+VIndex)*TCount;
689 691 i+=2;
690 692 if (i < len &&
691 -TBase <= PyUnicode_READ(kind, data, i) &&
692 -PyUnicode_READ(kind, data, i) <= (TBase+TCount)) {
693 +TBase < PyUnicode_READ(kind, data, i) &&
694 +PyUnicode_READ(kind, data, i) < (TBase+TCount)) {
695 +/* check T character is a modern trailing consonant
696 + (0x11A8 ~ 0x11C2). */
693 697 code += PyUnicode_READ(kind, data, i)-TBase;
694 698 i++;
695 699 }