[3.6] bpo-33899: Make tokenize module mirror end-of-file is end-of-li… · python/cpython@11c36a3
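The diff below (from `Lib/test/test_tokenize.py`) updates the test suite for the change this commit backports: `tokenize` now emits an implicit NEWLINE token when the source does not end with a newline, so ENDMARKER is always preceded by a logical end-of-line. The sketch that follows mirrors the new `test_implicit_newline` test rather than reproducing it; the printed token names are the expected result under this change, not output captured from the diff.

```python
from io import BytesIO
from tokenize import tokenize, tok_name

# "x" has no trailing newline; with this change the tokenizer still
# emits a NEWLINE token immediately before ENDMARKER.
tokens = list(tokenize(BytesIO(b"x").readline))
print([tok_name[t.type] for t in tokens])
# Expected with this change: ['ENCODING', 'NAME', 'NEWLINE', 'ENDMARKER']
```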

```diff
@@ -1,7 +1,8 @@
 from test import support
 from tokenize import (tokenize, _tokenize, untokenize, NUMBER, NAME, OP,
                      STRING, ENDMARKER, ENCODING, tok_name, detect_encoding,
-                     open as tokenize_open, Untokenizer)
+                     open as tokenize_open, Untokenizer, generate_tokens,
+                     NEWLINE)
 from io import BytesIO
 import unittest
 from unittest import TestCase, mock
```
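The import list now also pulls in `generate_tokens` and `NEWLINE`. As a point of reference (its use is not shown in this excerpt), `generate_tokens()` is the str-based counterpart of `tokenize()`: it takes a readline callable returning str and emits no ENCODING token. A small illustrative sketch, with the expected tail stated as an assumption under this change:

```python
import io
from tokenize import generate_tokens, tok_name

# Str-based tokenization; "x = 1" has no trailing newline, yet the
# stream is still expected to end with NEWLINE, ENDMARKER.
toks = list(generate_tokens(io.StringIO("x = 1").readline))
print([tok_name[t.type] for t in toks])
# Expected with this change: ['NAME', 'OP', 'NUMBER', 'NEWLINE', 'ENDMARKER']
```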

```diff
@@ -11,27 +12,51 @@
 import token
 
 
+# Converts a source string into a list of textual representation
+# of the tokens such as:
+# `    NAME       'if'          (1, 0) (1, 2)`
+# to make writing tests easier.
+def stringify_tokens_from_source(token_generator, source_string):
+    result = []
+    num_lines = len(source_string.splitlines())
+    missing_trailing_nl = source_string[-1] not in '\r\n'
+
+    for type, token, start, end, line in token_generator:
+        if type == ENDMARKER:
+            break
+        # Ignore the new line on the last line if the input lacks one
+        if missing_trailing_nl and type == NEWLINE and end[0] == num_lines:
+            continue
+        type = tok_name[type]
+        result.append(f"    {type:10} {token!r:13} {start} {end}")
+
+    return result
+
 class TokenizeTest(TestCase):
     # Tests for the tokenize module.
 
     # The tests can be really simple. Given a small fragment of source
-    # code, print out a table with tokens. The ENDMARKER is omitted for
-    # brevity.
+    # code, print out a table with tokens. The ENDMARKER, ENCODING and
+    # final NEWLINE are omitted for brevity.
 
     def check_tokenize(self, s, expected):
         # Format the tokens in s in a table format.
-        # The ENDMARKER is omitted.
-        result = []
+        # The ENDMARKER and final NEWLINE are omitted.
         f = BytesIO(s.encode('utf-8'))
-        for type, token, start, end, line in tokenize(f.readline):
-            if type == ENDMARKER:
-                break
-            type = tok_name[type]
-            result.append(f"    {type:10} {token!r:13} {start} {end}")
+        result = stringify_tokens_from_source(tokenize(f.readline), s)
+
         self.assertEqual(result,
                          ["    ENCODING   'utf-8'       (0, 0) (0, 0)"] +
                          expected.rstrip().splitlines())
 
+    def test_implicit_newline(self):
+        # Make sure that the tokenizer puts in an implicit NEWLINE
+        # when the input lacks a trailing new line.
+        f = BytesIO("x".encode('utf-8'))
+        tokens = list(tokenize(f.readline))
+        self.assertEqual(tokens[-2].type, NEWLINE)
+        self.assertEqual(tokens[-1].type, ENDMARKER)
+
     def test_basic(self):
         self.check_tokenize("1 + 1", """\
     NUMBER     '1'           (1, 0) (1, 1)
```
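For context, the `stringify_tokens_from_source` helper added above produces the same table-style lines that `check_tokenize` compares against its expected text. A rough standalone usage sketch; it assumes the helper from the hunk above is in scope, and the commented output is an approximation of its `{type:10} {token!r:13}` formatting:

```python
from io import BytesIO
from tokenize import tokenize

source = "1 + 1"   # no trailing newline, so the helper filters the final NEWLINE
lines = stringify_tokens_from_source(
    tokenize(BytesIO(source.encode('utf-8')).readline), source)
# The ENCODING entry is not filtered by the helper, which is why
# check_tokenize prepends it to the expected output explicitly:
#     ENCODING   'utf-8'       (0, 0) (0, 0)
#     NUMBER     '1'           (1, 0) (1, 1)
#     OP         '+'           (1, 2) (1, 3)
#     NUMBER     '1'           (1, 4) (1, 5)
```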

```diff
@@ -993,8 +1018,8 @@ def readline():
             else:
                 return b''
 
-        # skip the initial encoding token and the end token
-        tokens = list(_tokenize(readline, encoding='utf-8'))[1:-1]
+        # skip the initial encoding token and the end tokens
+        tokens = list(_tokenize(readline, encoding='utf-8'))[1:-2]
         expected_tokens = [(3, '"ЉЊЈЁЂ"', (1, 0), (1, 7), '"ЉЊЈЁЂ"')]
         self.assertEqual(tokens, expected_tokens,
                          "bytes not decoded with encoding")
```

```diff
@@ -1010,8 +1035,8 @@ def readline():
             else:
                 return b''
 
-        # skip the end token
-        tokens = list(_tokenize(readline, encoding=None))[:-1]
+        # skip the end tokens
+        tokens = list(_tokenize(readline, encoding=None))[:-2]
         expected_tokens = [(3, '"ЉЊЈЁЂ"', (1, 0), (1, 7), '"ЉЊЈЁЂ"')]
         self.assertEqual(tokens, expected_tokens,
                          "string not tokenized when encoding is None")
```
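The two hunks above change the trailing slice from one element to two because, with the implicit NEWLINE, the token stream for a source lacking a final newline now ends in NEWLINE followed by ENDMARKER. A sketch of the tail those `[1:-2]` / `[:-2]` slices discard, using the public `tokenize()` rather than the private `_tokenize()` the tests call:

```python
from io import BytesIO
from tokenize import tokenize, tok_name

toks = list(tokenize(BytesIO('"ЉЊЈЁЂ"'.encode('utf-8')).readline))
print([tok_name[t.type] for t in toks])
# Expected with this change: ['ENCODING', 'STRING', 'NEWLINE', 'ENDMARKER'],
# so the tests now drop two trailing tokens instead of one.
```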

```diff
@@ -1322,18 +1347,21 @@ def test_oneline_defs(self):
 
         # Test that 500 consequent, one-line defs is OK
         toks = list(tokenize(BytesIO(buf.encode('utf-8')).readline))
-        self.assertEqual(toks[-2].string, 'OK') # [-1] is always ENDMARKER
+        self.assertEqual(toks[-3].string, 'OK') # [-1] is always ENDMARKER
+                                                # [-2] is always NEWLINE
 
     def assertExactTypeEqual(self, opstr, *optypes):
         tokens = list(tokenize(BytesIO(opstr.encode('utf-8')).readline))
         num_optypes = len(optypes)
-        self.assertEqual(len(tokens), 2 + num_optypes)
+        self.assertEqual(len(tokens), 3 + num_optypes)
         self.assertEqual(token.tok_name[tokens[0].exact_type],
                          token.tok_name[ENCODING])
         for i in range(num_optypes):
             self.assertEqual(token.tok_name[tokens[i + 1].exact_type],
                              token.tok_name[optypes[i]])
         self.assertEqual(token.tok_name[tokens[1 + num_optypes].exact_type],
+                         token.tok_name[token.NEWLINE])
+        self.assertEqual(token.tok_name[tokens[2 + num_optypes].exact_type],
                          token.tok_name[token.ENDMARKER])
 
     def test_exact_type(self):
```
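In `assertExactTypeEqual` above, the expected length becomes `3 + num_optypes` because the stream is now ENCODING, the operator tokens, NEWLINE, ENDMARKER, and the two final assertions pin down that NEWLINE/ENDMARKER order. A small sketch of the count for a single operator; it is an assumed interactive check, not part of the diff:

```python
from io import BytesIO
from tokenize import tokenize

tokens = list(tokenize(BytesIO(b"+").readline))
# ENCODING + one OP token + NEWLINE + ENDMARKER
assert len(tokens) == 3 + 1   # previously 2 + 1, before the implicit NEWLINE
```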

```diff
@@ -1484,7 +1512,7 @@ def test_roundtrip(self):
         self.check_roundtrip("if x == 1:\n"
                              "    print(x)\n")
         self.check_roundtrip("# This is a comment\n"
-                             "# This also")
+                             "# This also\n")
 
         # Some people use different formatting conventions, which makes
         # untokenize a little trickier. Note that this test involves trailing
```
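The roundtrip input gains a trailing `\n`, presumably because untokenizing a stream that now ends with an implicit NEWLINE yields source ending in a newline, so only inputs that already end with one roundtrip exactly. A hedged sketch of the roundtrip the updated input preserves; the assertion mirrors what `check_roundtrip` is assumed to verify for string input:

```python
from io import BytesIO
from tokenize import tokenize, untokenize

src = "# This is a comment\n# This also\n"
out = untokenize(tokenize(BytesIO(src.encode('utf-8')).readline)).decode('utf-8')
assert out == src   # exact roundtrip once the input ends with a newline
```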