bpo-29456: Fix bugs in unicodedata.normalize: u1176, u11a7 and u11c3 … · python/cpython@e2e7ff0 (original) (raw)

4 files changed

lines changed

Original file line number	Diff line number	Diff line change
@@ -208,6 +208,19 @@ def test_issue10254(self):
208	208	b = 'C\u0338' * 20 + '\xC7'
209	209	self.assertEqual(self.db.normalize('NFC', a), b)
210	210
	211	+def test_issue29456(self):
	212	+# Fix #29456
	213	+u1176_str_a = '\u1100\u1176\u11a8'
	214	+u1176_str_b = '\u1100\u1176\u11a8'
	215	+u11a7_str_a = '\u1100\u1175\u11a7'
	216	+u11a7_str_b = '\uae30\u11a7'
	217	+u11c3_str_a = '\u1100\u1175\u11c3'
	218	+u11c3_str_b = '\uae30\u11c3'
	219	+self.assertEqual(self.db.normalize('NFC', u1176_str_a), u1176_str_b)
	220	+self.assertEqual(self.db.normalize('NFC', u11a7_str_a), u11a7_str_b)
	221	+self.assertEqual(self.db.normalize('NFC', u11c3_str_a), u11c3_str_b)
	222	+
	223	+
211	224	def test_east_asian_width(self):
212	225	eaw = self.db.east_asian_width
213	226	self.assertRaises(TypeError, eaw, b'a')

Original file line number	Diff line number	Diff line change
@@ -0,0 +1 @@
	1	+Fix bugs in Hangul normalization: U+1176, U+11A7 and U+11C3.

Original file line number	Diff line number	Diff line change
@@ -681,15 +681,19 @@ nfc_nfkc(PyObject *self, PyObject *input, int k)
681	681	if (LBase <= code && code < (LBase+LCount) &&
682	682	i + 1 < len &&
683	683	VBase <= PyUnicode_READ(kind, data, i+1) &&
684		-PyUnicode_READ(kind, data, i+1) <= (VBase+VCount)) {
	684	+PyUnicode_READ(kind, data, i+1) < (VBase+VCount)) {
	685	+/* check L character is a modern leading consonant (0x1100 ~ 0x1112)
	686	+ and V character is a modern vowel (0x1161 ~ 0x1175). */
685	687	int LIndex, VIndex;
686	688	LIndex = code - LBase;
687	689	VIndex = PyUnicode_READ(kind, data, i+1) - VBase;
688	690	code = SBase + (LIndex*VCount+VIndex)*TCount;
689	691	i+=2;
690	692	if (i < len &&
691		-TBase <= PyUnicode_READ(kind, data, i) &&
692		-PyUnicode_READ(kind, data, i) <= (TBase+TCount)) {
	693	+TBase < PyUnicode_READ(kind, data, i) &&
	694	+PyUnicode_READ(kind, data, i) < (TBase+TCount)) {
	695	+/* check T character is a modern trailing consonant
	696	+ (0x11A8 ~ 0x11C2). */
693	697	code += PyUnicode_READ(kind, data, i)-TBase;
694	698	i++;
695	699	}