bpo-29456: Fix bugs in unicodedata.normalize: u1176, u11a7 and u11c3 … · python/cpython@e2e7ff0 (original) (raw)
4 files changed
lines changed
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -208,6 +208,19 @@ def test_issue10254(self): | ||
208 | 208 | b = 'C\u0338' * 20 + '\xC7' |
209 | 209 | self.assertEqual(self.db.normalize('NFC', a), b) |
210 | 210 | |
211 | +def test_issue29456(self): | |
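212 | +# bpo-29456: U+1176 is not a modern vowel and U+11A7/U+11C3 are not modern trailing consonants, so NFC must leave them uncomposed. | |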
213 | +u1176_str_a = '\u1100\u1176\u11a8' | |
214 | +u1176_str_b = '\u1100\u1176\u11a8' | |
215 | +u11a7_str_a = '\u1100\u1175\u11a7' | |
216 | +u11a7_str_b = '\uae30\u11a7' | |
217 | +u11c3_str_a = '\u1100\u1175\u11c3' | |
218 | +u11c3_str_b = '\uae30\u11c3' | |
219 | +self.assertEqual(self.db.normalize('NFC', u1176_str_a), u1176_str_b) | |
220 | +self.assertEqual(self.db.normalize('NFC', u11a7_str_a), u11a7_str_b) | |
221 | +self.assertEqual(self.db.normalize('NFC', u11c3_str_a), u11c3_str_b) | |
222 | + | |
223 | + | |
211 | 224 | def test_east_asian_width(self): |
212 | 225 | eaw = self.db.east_asian_width |
213 | 226 | self.assertRaises(TypeError, eaw, b'a') |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1742,6 +1742,7 @@ Jason Yeo | ||
1742 | 1742 | EungJun Yi |
1743 | 1743 | Bob Yodlowski |
1744 | 1744 | Danny Yoo |
1745 | +Wonsup Yoon | |
1745 | 1746 | Rory Yorke |
1746 | 1747 | George Yoshida |
1747 | 1748 | Kazuhiro Yoshida |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
1 | +Fix bugs in Hangul normalization: U+1176, U+11A7 and U+11C3. |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -681,15 +681,19 @@ nfc_nfkc(PyObject *self, PyObject *input, int k) | ||
681 | 681 | if (LBase <= code && code < (LBase+LCount) && |
682 | 682 | i + 1 < len && |
683 | 683 | VBase <= PyUnicode_READ(kind, data, i+1) && |
684 | -PyUnicode_READ(kind, data, i+1) <= (VBase+VCount)) { | |
684 | +PyUnicode_READ(kind, data, i+1) < (VBase+VCount)) { | |
685 | +/* Check that the L character is a modern leading consonant (0x1100 ~ 0x1112) | |
686 | + and that the V character is a modern vowel (0x1161 ~ 0x1175). */ | |
685 | 687 | int LIndex, VIndex; |
686 | 688 | LIndex = code - LBase; |
687 | 689 | VIndex = PyUnicode_READ(kind, data, i+1) - VBase; |
688 | 690 | code = SBase + (LIndex*VCount+VIndex)*TCount; |
689 | 691 | i+=2; |
690 | 692 | if (i < len && |
691 | -TBase <= PyUnicode_READ(kind, data, i) && | |
692 | -PyUnicode_READ(kind, data, i) <= (TBase+TCount)) { | |
693 | +TBase < PyUnicode_READ(kind, data, i) && | |
694 | +PyUnicode_READ(kind, data, i) < (TBase+TCount)) { | |
695 | +/* Check that the T character is a modern trailing consonant | |
696 | + (0x11A8 ~ 0x11C2); TBase itself is excluded. */ | |
693 | 697 | code += PyUnicode_READ(kind, data, i)-TBase; |
694 | 698 | i++; |
695 | 699 | } |