Issue 9712: tokenize yields an ERRORTOKEN if the identifier starts with a non-ASCII char (original) (raw)

from io import BytesIO
from tokenize import tokenize, tok_name

sample = 'éléphants = "un éléphant, deux éléphants, ..."\nprint(éléphants)\n'
sampleb = sample.encode('utf-8')

exec(sample)

output: un éléphant, deux éléphants, ...

exec(sampleb)

output: un éléphant, deux éléphants, ...

module = BytesIO()
module.write(sampleb)
module.seek(0)

for line in tokenize(module.readline):
    print(tok_name[line.type], line)

output:

ENCODING TokenInfo(type=57, string='utf-8', start=(0, 0), end=(0, 0), line='')
ERRORTOKEN TokenInfo(type=54, string='é', start=(1, 0), end=(1, 1), line='éléphants = "un éléphant, deux éléphants, ..."\n')
NAME TokenInfo(type=1, string='léphants', start=(1, 1), end=(1, 9), line='éléphants = "un éléphant, deux éléphants, ..."\n')
OP TokenInfo(type=53, string='=', start=(1, 10), end=(1, 11), line='éléphants = "un éléphant, deux éléphants, ..."\n')
STRING TokenInfo(type=3, string='"un éléphant, deux éléphants, ..."', start=(1, 12), end=(1, 46), line='éléphants = "un éléphant, deux éléphants, ..."\n')
NEWLINE TokenInfo(type=4, string='\n', start=(1, 46), end=(1, 47), line='éléphants = "un éléphant, deux éléphants, ..."\n')
NAME TokenInfo(type=1, string='print', start=(2, 0), end=(2, 5), line='print(éléphants)\n')
OP TokenInfo(type=53, string='(', start=(2, 5), end=(2, 6), line='print(éléphants)\n')
ERRORTOKEN TokenInfo(type=54, string='é', start=(2, 6), end=(2, 7), line='print(éléphants)\n')
NAME TokenInfo(type=1, string='léphants', start=(2, 7), end=(2, 15), line='print(éléphants)\n')
OP TokenInfo(type=53, string=')', start=(2, 15), end=(2, 16), line='print(éléphants)\n')
NEWLINE TokenInfo(type=4, string='\n', start=(2, 16), end=(2, 17), line='print(éléphants)\n')
ENDMARKER TokenInfo(type=0, string='', start=(3, 0), end=(3, 0), line='')