cpython: 9ce5c1f371f7
changeset   96712:9ce5c1f371f7
branch      3.4
description Issue #20387: Merge
author      Jason R. Coombs <jaraco@jaraco.com>
date        Sun, 28 Jun 2015 11:10:29 -0400
parents     fd17e168b59f (current diff), 330e28b28334 (diff)
children    98380a6e037c, c95d7ffa492e
files       Misc/NEWS
diffstat    3 files changed, 40 insertions(+), 1 deletions(-)

            Lib/test/test_tokenize.py |  21 ++++++++++++++++++++-
            Lib/tokenize.py           |  17 +++++++++++++++++
            Misc/NEWS                 |   3 +++
--- a/Lib/test/test_tokenize.py
+++ b/Lib/test/test_tokenize.py
@@ -5,6 +5,8 @@ The tests can be really simple. Given a
 code, print out a table with tokens. The ENDMARKER is omitted for
 brevity.
 
+    >>> import glob
+
     >>> dump_tokens("1 + 1")
     ENCODING   'utf-8'     (0, 0) (0, 0)
     NUMBER     '1'         (1, 0) (1, 1)
@@ -647,7 +649,7 @@ from tokenize import (tokenize, _tokeniz
                       open as tokenize_open, Untokenizer)
 from io import BytesIO
 from unittest import TestCase, mock
-import os, sys, glob
+import os
 import token
 
 def dump_tokens(s):
@@ -1227,6 +1229,22 @@ class UntokenizeTest(TestCase):
 
         self.assertEqual(untokenize(iter(tokens)), b'Hello ')
 
+class TestRoundtrip(TestCase):
+
+    def roundtrip(self, code):
+        if isinstance(code, str):
+            code = code.encode('utf-8')
+        return untokenize(tokenize(BytesIO(code).readline)).decode('utf-8')
+
+    def test_indentation_semantics_retained(self):
+        """
+        Ensure that although whitespace might be mutated in a roundtrip,
+        the semantic meaning of the indentation remains consistent.
+        """
+        code = "if False:\n\tx=3\n\tx=3\n"
+        codelines = self.roundtrip(code).split('\n')
+        self.assertEqual(codelines[1], codelines[2])
+
 
 __test__ = {"doctests" : doctests, 'decistmt': decistmt}
 
@@ -1237,6 +1255,7 @@ def test_main():
     support.run_unittest(TestDetectEncoding)
     support.run_unittest(TestTokenize)
     support.run_unittest(UntokenizeTest)
+    support.run_unittest(TestRoundtrip)
 
 if __name__ == "__main__":
     test_main()
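
Note: the new TestRoundtrip case can be reproduced by hand with nothing
but the public tokenize API. A minimal sketch follows (the roundtrip
helper mirrors the test method above; it is not a library function):

    # Sketch of what TestRoundtrip exercises (Python 3.4+ with this fix).
    from io import BytesIO
    from tokenize import tokenize, untokenize

    def roundtrip(code):
        # Tokenize source bytes, then regenerate source text from the tokens.
        if isinstance(code, str):
            code = code.encode('utf-8')
        return untokenize(tokenize(BytesIO(code).readline)).decode('utf-8')

    # Two identically tab-indented statements inside one block.
    code = "if False:\n\tx=3\n\tx=3\n"
    lines = roundtrip(code).split('\n')

    # Whitespace inside a line may be normalized, but both body lines must
    # keep the same indentation prefix so the block structure is preserved.
    assert lines[1] == lines[2]
    compile(roundtrip(code), '<roundtrip>', 'exec')  # must not raise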
--- a/Lib/tokenize.py
+++ b/Lib/tokenize.py
@@ -244,6 +244,8 @@ class Untokenizer:
 
     def untokenize(self, iterable):
         it = iter(iterable)
+        indents = []
+        startline = False
         for t in it:
             if len(t) == 2:
                 self.compat(t, it)
@@ -254,6 +256,21 @@ class Untokenizer:
                 continue
             if tok_type == ENDMARKER:
                 break
+            if tok_type == INDENT:
+                indents.append(token)
+                continue
+            elif tok_type == DEDENT:
+                indents.pop()
+                self.prev_row, self.prev_col = end
+                continue
+            elif tok_type in (NEWLINE, NL):
+                startline = True
+            elif startline and indents:
+                indent = indents[-1]
+                if start[1] >= len(indent):
+                    self.tokens.append(indent)
+                    self.prev_col = len(indent)
+                startline = False
             self.add_whitespace(start)
             self.tokens.append(token)
             self.prev_row, self.prev_col = end
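
Note: the patch works because tokenize emits the full leading-whitespace
string as the INDENT token's text, while DEDENT carries an empty string.
untokenize can therefore keep a stack of those strings and replay the
innermost one at the start of each following logical line, instead of
synthesizing indentation as spaces in add_whitespace. A small sketch of
that bookkeeping, using only documented tokenize names (the sample source
and the expected output in comments are illustrative):

    # Sketch: the token-level facts the new `indents` stack relies on.
    from io import BytesIO
    from tokenize import tokenize, untokenize, tok_name, INDENT, DEDENT

    source = b"if True:\n\twhile True:\n\t\tbreak\n\tpass\n"

    # INDENT tokens carry the whole indentation string; DEDENT carries ''.
    for tok in tokenize(BytesIO(source).readline):
        if tok.type in (INDENT, DEDENT):
            print(tok_name[tok.type], repr(tok.string))
    # INDENT '\t'
    # INDENT '\t\t'
    # DEDENT ''
    # DEDENT ''

    # With the fix, every regenerated body line reuses its original tab prefix.
    regen = untokenize(tokenize(BytesIO(source).readline)).decode('utf-8')
    assert all(line.startswith('\t') for line in regen.split('\n')[1:4])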
--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -60,6 +60,9 @@ Core and Builtins
 
 Library
 -------
 
+- Issue #20387: Restore semantic round-trip correctness in tokenize/untokenize
+  for tab-indented blocks.
+
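
Note: for contrast, the failure mode this NEWS entry records. Before the
patch, untokenize emitted the INDENT token's tab once, but indentation on
subsequent lines at the same level came from add_whitespace as spaces, so
a tab-indented block came back mixing tabs and spaces and no longer
compiled. A hypothetical session against an unpatched 3.4 tokenize (not
reproducible on interpreters that include this fix):

    # Hypothetical pre-fix behavior (unpatched tokenize; illustration only).
    from io import BytesIO
    from tokenize import tokenize, untokenize

    code = b"if False:\n\tx=3\n\tx=3\n"
    regen = untokenize(tokenize(BytesIO(code).readline)).decode('utf-8')
    lines = regen.split('\n')
    # Unpatched: lines[1] begins with '\t' (the INDENT token text) while
    # lines[2] begins with a single space from add_whitespace, so compiling
    # the regenerated source fails with an indentation/tab error.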