Issue 12266: str.capitalize contradicts oneself (original) (raw)

>>> import sys; hex(sys.maxunicode)
'0x10ffff'
>>> import unicodedata; unicodedata.unidata_version
'6.0.0'

>>> import unicodedata
>>> all_chars = list(map(chr, range(0x110000)))
>>> Ll = [c for c in all_chars if unicodedata.category(c) == 'Ll']
>>> Lu = [c for c in all_chars if unicodedata.category(c) == 'Lu']
>>> Lt = [c for c in all_chars if unicodedata.category(c) == 'Lt']
>>> Lo = [c for c in all_chars if unicodedata.category(c) == 'Lo']
>>> Lm = [c for c in all_chars if unicodedata.category(c) == 'Lm']

>>> [len(x) for x in [Ll, Lu, Lt, Lo, Lm]]
[1759, 1436, 31, 97084, 210]
>>> sum(1 for c in Lu if c.lower() == c)
471  # uppercase chars with no lower
>>> sum(1 for c in Lt if c.lower() == c)
0  # titlecase chars with no lower
>>> sum(1 for c in Ll if c.upper() == c)
760  # lowercase chars with no upper
>>> sum(1 for c in Lo if c.upper() != c or c.title() != c or c.lower() != c)
0  # "Letter, other" chars with a different upper/title/lower case
>>> sum(1 for c in Lm if c.upper() != c or c.title() != c or c.lower() != c)
0  # "Letter, modifier" chars with a different upper/title/lower case
>>> # NOTE: L is not defined in this excerpt; presumably it is the collection of all letters (Ll + Lu + Lt + Lo + Lm) -- confirm against the original message.
>>> sum(1 for c in all_chars if c not in L and (c.upper() != c or c.title() != c or c.lower() != c))
85  # non-letter chars with a different upper/title/lower case
>>> [c for c in all_chars if c not in L and (c.upper() != c or c.title() != c or c.lower() != c)]
['', 'Ⅰ', 'Ⅱ', 'Ⅲ', 'Ⅳ', 'Ⅴ', 'Ⅵ', 'Ⅶ', 'Ⅷ', 'Ⅸ', 'Ⅹ', 'Ⅺ', 'Ⅻ', 'Ⅼ', 'Ⅽ', 'Ⅾ', 'Ⅿ', 'ⅰ', 'ⅱ', 'ⅲ', 'ⅳ', 'ⅴ', 'ⅵ', 'ⅶ', 'ⅷ', 'ⅸ', 'ⅹ', 'ⅺ', 'ⅻ', 'ⅼ', 'ⅽ', 'ⅾ', 'ⅿ', 'Ⓐ', 'Ⓑ', 'Ⓒ', 'Ⓓ', 'Ⓔ', 'Ⓕ', 'Ⓖ', 'Ⓗ', 'Ⓘ', 'Ⓙ', 'Ⓚ', 'Ⓛ', 'Ⓜ', 'Ⓝ', 'Ⓞ', 'Ⓟ', 'Ⓠ', 'Ⓡ', 'Ⓢ', 'Ⓣ', 'Ⓤ', 'Ⓥ', 'Ⓦ', 'Ⓧ', 'Ⓨ', 'Ⓩ', 'ⓐ', 'ⓑ', 'ⓒ', 'ⓓ', 'ⓔ', 'ⓕ', 'ⓖ', 'ⓗ', 'ⓘ', 'ⓙ', 'ⓚ', 'ⓛ', 'ⓜ', 'ⓝ', 'ⓞ', 'ⓟ', 'ⓠ', 'ⓡ', 'ⓢ', 'ⓣ', 'ⓤ', 'ⓥ', 'ⓦ', 'ⓧ', 'ⓨ', 'ⓩ']
>>> list(c.lower() for c in _)
['', 'ⅰ', 'ⅱ', 'ⅲ', 'ⅳ', 'ⅴ', 'ⅵ', 'ⅶ', 'ⅷ', 'ⅸ', 'ⅹ', 'ⅺ', 'ⅻ', 'ⅼ', 'ⅽ', 'ⅾ', 'ⅿ', 'ⅰ', 'ⅱ', 'ⅲ', 'ⅳ', 'ⅴ', 'ⅵ', 'ⅶ', 'ⅷ', 'ⅸ', 'ⅹ', 'ⅺ', 'ⅻ', 'ⅼ', 'ⅽ', 'ⅾ', 'ⅿ', 'ⓐ', 'ⓑ', 'ⓒ', 'ⓓ', 'ⓔ', 'ⓕ', 'ⓖ', 'ⓗ', 'ⓘ', 'ⓙ', 'ⓚ', 'ⓛ', 'ⓜ', 'ⓝ', 'ⓞ', 'ⓟ', 'ⓠ', 'ⓡ', 'ⓢ', 'ⓣ', 'ⓤ', 'ⓥ', 'ⓦ', 'ⓧ', 'ⓨ', 'ⓩ', 'ⓐ', 'ⓑ', 'ⓒ', 'ⓓ', 'ⓔ', 'ⓕ', 'ⓖ', 'ⓗ', 'ⓘ', 'ⓙ', 'ⓚ', 'ⓛ', 'ⓜ', 'ⓝ', 'ⓞ', 'ⓟ', 'ⓠ', 'ⓡ', 'ⓢ', 'ⓣ', 'ⓤ', 'ⓥ', 'ⓦ', 'ⓧ', 'ⓨ', 'ⓩ']
>>> len(_)
85
>>> {unicodedata.category(c) for c in all_chars if c not in L and (c.upper() != c or c.title() != c or c.lower() != c)}
{'So', 'Mn', 'Nl'}

So == Symbol, Other
Mn == Mark, Nonspacing
Nl == Number, Letter