(original) (raw)

Line

Count

Source (jump to first uncovered line)

1

2

/* Tokenizer implementation */

3

4

#define PY_SSIZE_T_CLEAN

5

#include "Python.h"

6

#include "pycore_call.h" // _PyObject_CallNoArgs()

7

8

#include <ctype.h>

9

#include <assert.h>

10

11

#include "tokenizer.h"

12

#include "errcode.h"

13

14

#include "unicodeobject.h"

15

#include "bytesobject.h"

16

#include "fileobject.h"

17

#include "abstract.h"

18

19

/* Tab width used for the alternate tab spacing */
#define ALTTABSIZE 1

21

22

/* Non-zero when C may begin an identifier: an ASCII letter, '_', or any
   value >= 128.  The argument is parenthesized so that arbitrary expressions
   can be passed, but it is still evaluated several times: do not pass an
   expression with side effects. */
#define is_potential_identifier_start(c) (\
              ((c) >= 'a' && (c) <= 'z')\
               || ((c) >= 'A' && (c) <= 'Z')\
               || (c) == '_'\
               || ((c) >= 128))

27

28

/* Non-zero when C may appear inside an identifier: an ASCII letter or
   digit, '_', or any value >= 128.  The argument is parenthesized so that
   arbitrary expressions can be passed, but it is still evaluated several
   times: do not pass an expression with side effects. */
#define is_potential_identifier_char(c) (\
              ((c) >= 'a' && (c) <= 'z')\
               || ((c) >= 'A' && (c) <= 'Z')\
               || ((c) >= '0' && (c) <= '9')\
               || (c) == '_'\
               || ((c) >= 128))

34

35

36

/* Standard tab width.
   Don't ever change this -- it would break the portability of Python code */
#define TABSIZE 8

38

39

/* Forward declarations */
static struct tok_state *tok_new(void);
static int tok_nextc(struct tok_state *tok);
static void tok_backup(struct tok_state *tok, int c);
static int syntaxerror(struct tok_state *tok, const char *format, ...);

44

45

/* Prefix that introduces a type comment.  Spaces in this constant are
   treated as "zero or more spaces or tabs" when tokenizing. */
static const char* type_comment_prefix = "# type: ";

48

49

/* Create and initialize a new tok_state structure */

50

51

static struct tok_state *

52

tok_new(void)

53

{

54

struct tok_state *tok = (struct tok_state *)PyMem_Malloc(

55

                                        sizeof(struct tok_state));

56

if (tok == NULL)

Branch (56:9): [True: 0, False: 132k]

57

    return NULL;

58

tok->buf = tok->cur = tok->inp = NULL;

59

tok->fp_interactive = 0;

60

tok->interactive_src_start = NULL;

61

tok->interactive_src_end = NULL;

62

tok->start = NULL;

63

tok->end = NULL;

64

tok->done = E_OK;

65

tok->fp = NULL;

66

tok->input = NULL;

67

tok->tabsize = TABSIZE;

68

tok->indent = 0;

69

tok->indstack[0] = 0;

70

tok->atbol = 1;

71

tok->pendin = 0;

72

tok->prompt = tok->nextprompt = NULL;

73

tok->lineno = 0;

74

tok->level = 0;

75

tok->altindstack[0] = 0;

76

tok->decoding_state = STATE_INIT;

77

tok->decoding_erred = 0;

78

tok->enc = NULL;

79

tok->encoding = NULL;

80

tok->cont_line = 0;

81

tok->filename = NULL;

82

tok->decoding_readline = NULL;

83

tok->decoding_buffer = NULL;

84

tok->type_comments = 0;

85

tok->async_hacks = 0;

86

tok->async_def = 0;

87

tok->async_def_indent = 0;

88

tok->async_def_nl = 0;

89

tok->interactive_underflow = IUNDERFLOW_NORMAL;

90

tok->str = NULL;

91

#ifdef Py_DEBUG

92

tok->debug = _Py_GetConfig()->parser_debug;

93

#endif

94

return tok;

95

}

96

97

static char *

98

new_string(const char *s, Py_ssize_t len, struct tok_state *tok)

99

{

100

char* result = (char *)PyMem_Malloc(len + 1);

101

if (!result) {

Branch (101:9): [True: 0, False: 57.1k]

102

    tok->done = E_NOMEM;

103

    return NULL;

104

}

105

memcpy(result, s, len);

106

result[len] = '\0';

107

return result;

108

}

109

110

static char *

111

error_ret(struct tok_state tok) / XXX */

112

{

113

tok->decoding_erred = 1;

114

if (tok->fp != NULL && 

tok->buf != NULL0

) /* see _PyTokenizer_Free */

Branch (114:9): [True: 0, False: 26] Branch (114:28): [True: 0, False: 0]

115

    PyMem_Free(tok->buf);

116

tok->buf = tok->cur = tok->inp = NULL;

117

tok->start = NULL;

118

tok->end = NULL;

119

tok->done = E_DECODE;

120

return NULL;                /* as if it were EOF */

121

}

122

123

124

/* Map an encoding name to its canonical spelling for the utf-8 and latin-1
   families.

   Only the first 12 characters of S are examined; they are lower-cased and
   '_' is treated as '-'.  Returns the literal "utf-8" or "iso-8859-1" when
   S names (or starts with the "<name>-" prefix of) one of those encodings,
   and otherwise returns S itself, so callers can detect "no change" by
   pointer comparison. */
static const char *
get_normal_name(const char *s)  /* for utf-8 and latin-1 */
{
    char buf[13];
    int i;
    for (i = 0; i < 12; i++) {
        /* Go through unsigned char: handing tolower() a negative value
           (a byte >= 0x80 where char is signed) is undefined behaviour. */
        int c = (unsigned char)s[i];
        if (c == '\0')
            break;
        else if (c == '_')
            buf[i] = '-';
        else
            buf[i] = (char)tolower(c);
    }
    buf[i] = '\0';
    if (strcmp(buf, "utf-8") == 0 ||
        strncmp(buf, "utf-8-", 6) == 0)
        return "utf-8";
    else if (strcmp(buf, "latin-1") == 0 ||
             strcmp(buf, "iso-8859-1") == 0 ||
             strcmp(buf, "iso-latin-1") == 0 ||
             strncmp(buf, "latin-1-", 8) == 0 ||
             strncmp(buf, "iso-8859-1-", 11) == 0 ||
             strncmp(buf, "iso-latin-1-", 12) == 0)
        return "iso-8859-1";
    else
        return s;
}

152

153

/* Return the coding spec in S, or NULL if none is found. */

154

155

static int

156

get_coding_spec(const char *s, char **spec, Py_ssize_t size, struct tok_state *tok)

157

{

158

Py_ssize_t i;

159

*spec = NULL;

160

/* Coding spec must be in a comment, and that comment must be

161

 * the only statement on the source code line. */

162

for (i = 0; i < size - 6; 

i++0

) {

Branch (162:17): [True: 5.04k, False: 70.9k]

163

    if (s[i] == '#')

Branch (163:13): [True: 336, False: 4.70k]

164

        break;

165

    if (s[i] != ' ' && s[i] != '\t' && s[i] != '\014')

Branch (165:13): [True: 4.70k, False: 0] Branch (165:28): [True: 4.70k, False: 0] Branch (165:44): [True: 4.70k, False: 0]

166

        return 1;

167

}

168

for (; 71.2k

i < size - 6;

i++10.1k

) { /* XXX inefficient search */

Branch (168:12): [True: 10.2k, False: 71.1k]

169

    const char* t = s + i;

170

    if (memcmp(t, "coding", 6) == 0) {

Branch (170:13): [True: 77, False: 10.1k]

171

        const char* begin = NULL;

172

        t += 6;

173

        if (t[0] != ':' && 

t[0] != '='22

)

Branch (173:17): [True: 22, False: 55] Branch (173:32): [True: 0, False: 22]

174

            continue;

175

do 77

{

176

            t++;

177

        } while (t[0] == ' ' || 

t[0] == '\t'77

);

Branch (177:22): [True: 46, False: 77] Branch (177:37): [True: 0, False: 77]

178

179

        begin = t;

180

        while (Py_ISALNUM(t[0]) ||

181

t[0] == '-'133

||

t[0] == '_'77

||

t[0] == '.'77

)

Branch (181:20): [True: 56, False: 77] Branch (181:35): [True: 0, False: 77] Branch (181:50): [True: 0, False: 77]

182

            t++;

183

184

        if (begin < t) {

Branch (184:17): [True: 77, False: 0]

185

            char* r = new_string(begin, t - begin, tok);

186

            const char* q;

187

            if (!r)

Branch (187:21): [True: 0, False: 77]

188

                return 0;

189

            q = get_normal_name(r);

190

            if (r != q) {

Branch (190:21): [True: 30, False: 47]

191

                PyMem_Free(r);

192

                r = new_string(q, strlen(q), tok);

193

                if (!r)

Branch (193:25): [True: 0, False: 30]

194

                    return 0;

195

            }

196

            *spec = r;

197

            break;

198

        }

199

    }

200

}

201

return 1;

202

}

203

204

/* Check whether the line contains a coding spec. If it does,

205

invoke the set_readline function for the new encoding.

206

This function receives the tok_state and the new encoding.

207

Return 1 on success, 0 on failure. */

208

209

static int

210

check_coding_spec(const char* line, Py_ssize_t size, struct tok_state *tok,

211

              int set_readline(struct tok_state *, const char *))

212

{

213

char *cs;

214

if (tok->cont_line) {

Branch (214:9): [True: 0, False: 75.9k]

215

    /* It's a continuation line, so it can't be a coding spec. */

216

    tok->decoding_state = STATE_NORMAL;

217

    return 1;

218

}

219

if (!get_coding_spec(line, &cs, size, tok)) {

Branch (219:9): [True: 0, False: 75.9k]

220

    return 0;

221

}

222

if (!cs) {

Branch (222:9): [True: 75.9k, False: 77]

223

    Py_ssize_t i;

224

    for (i = 0; i < size; 

i++0

) {

Branch (224:21): [True: 75.5k, False: 344]

225

        if (line[i] == '#' || 

line[i] == '\n'75.2k

||

line[i] == '\r'75.1k

)

Branch (225:17): [True: 290, False: 75.2k] Branch (225:35): [True: 78, False: 75.1k] Branch (225:54): [True: 0, False: 75.1k]

226

            break;

227

        if (line[i] != ' ' && line[i] != '\t' && line[i] != '\014') {

Branch (227:17): [True: 75.1k, False: 0] Branch (227:35): [True: 75.1k, False: 0] Branch (227:54): [True: 75.1k, False: 0]

228

            /* Stop checking coding spec after a line containing

229

             * anything except a comment. */

230

            tok->decoding_state = STATE_NORMAL;

231

            break;

232

        }

233

    }

234

    return 1;

235

}

236

tok->decoding_state = STATE_NORMAL;

237

if (tok->encoding == NULL) {

Branch (237:9): [True: 51, False: 26]

238

    assert(tok->decoding_readline == NULL);

239

    if (strcmp(cs, "utf-8") != 0 && 

!set_readline(tok, cs)43

) {

Branch (239:13): [True: 43, False: 8] Branch (239:41): [True: 0, False: 43]

240

        error_ret(tok);

241

        PyErr_Format(PyExc_SyntaxError, "encoding problem: %s", cs);

242

        PyMem_Free(cs);

243

        return 0;

244

    }

245

    tok->encoding = cs;

246

} else {                /* then, compare cs with BOM */

247

    if (strcmp(tok->encoding, cs) != 0) {

Branch (247:13): [True: 20, False: 6]

248

        error_ret(tok);

249

        PyErr_Format(PyExc_SyntaxError,

250

                     "encoding problem: %s with BOM", cs);

251

        PyMem_Free(cs);

252

        return 0;

253

    }

254

    PyMem_Free(cs);

255

}

256

return 1;

257

}

258

259

/* See whether the file starts with a BOM. If it does,

260

invoke the set_readline function with the new encoding.

261

Return 1 on success, 0 on failure. */

262

263

static int

264

check_bom(int get_char(struct tok_state *),

265

      void unget_char(int, struct tok_state *),

266

      int set_readline(struct tok_state *, const char *),

267

      struct tok_state *tok)

268

{

269

int ch1, ch2, ch3;

270

ch1 = get_char(tok);

271

tok->decoding_state = STATE_SEEK_CODING;

272

if (ch1 == EOF) {

Branch (272:9): [True: 0, False: 75.6k]

273

    return 1;

274

} else if (ch1 == 0xEF) {

Branch (274:16): [True: 37, False: 75.6k]

275

    ch2 = get_char(tok);

276

    if (ch2 != 0xBB) {

Branch (276:13): [True: 1, False: 36]

277

        unget_char(ch2, tok);

278

        unget_char(ch1, tok);

279

        return 1;

280

    }

281

    ch3 = get_char(tok);

282

    if (ch3 != 0xBF) {

Branch (282:13): [True: 2, False: 34]

283

        unget_char(ch3, tok);

284

        unget_char(ch2, tok);

285

        unget_char(ch1, tok);

286

        return 1;

287

    }

288

} else {

289

    unget_char(ch1, tok);

290

    return 1;

291

}

292

if (tok->encoding != NULL)

Branch (292:9): [True: 0, False: 34]

293

    PyMem_Free(tok->encoding);

294

tok->encoding = new_string("utf-8", 5, tok);

295

if (!tok->encoding)

Branch (295:9): [True: 0, False: 34]

296

    return 0;

297

/* No need to set_readline: input is already utf-8 */

298

return 1;

299

}

300

301

static int

302

tok_concatenate_interactive_new_line(struct tok_state *tok, const char *line) {

303

assert(tok->fp_interactive);

304

305

if (!line) {

Branch (305:9): [True: 0, False: 0]

306

    return 0;

307

}

308

309

Py_ssize_t current_size = tok->interactive_src_end - tok->interactive_src_start;

310

Py_ssize_t line_size = strlen(line);

311

char last_char = line[line_size > 0 ? line_size - 1 : line_size];

Branch (311:27): [True: 0, False: 0]

312

if (last_char != '\n') {

Branch (312:9): [True: 0, False: 0]

313

    line_size += 1;

314

}

315

char* new_str = tok->interactive_src_start;

316

317

new_str = PyMem_Realloc(new_str, current_size + line_size + 1);

318

if (!new_str) {

Branch (318:9): [True: 0, False: 0]

319

    if (tok->interactive_src_start) {

Branch (319:13): [True: 0, False: 0]

320

        PyMem_Free(tok->interactive_src_start);

321

    }

322

    tok->interactive_src_start = NULL;

323

    tok->interactive_src_end = NULL;

324

    tok->done = E_NOMEM;

325

    return -1;

326

}

327

strcpy(new_str + current_size, line);

328

if (last_char != '\n') {

Branch (328:9): [True: 0, False: 0]

329

    /* Last line does not end in \n, fake one */

330

    new_str[current_size + line_size - 1] = '\n';

331

    new_str[current_size + line_size] = '\0';

332

}

333

tok->interactive_src_start = new_str;

334

tok->interactive_src_end = new_str + current_size + line_size;

335

return 0;

336

}

337

338

339

/* Read a line of text from TOK into S, using the stream in TOK.

340

Return NULL on failure, else S.

341

342

On entry, tok->decoding_buffer will be one of:

343

 1) NULL: need to call tok->decoding_readline to get a new line

344

 2) PyUnicodeObject *: decoding_feof has called tok->decoding_readline and

345

   stored the result in tok->decoding_buffer

346

 3) PyByteArrayObject *: previous call to tok_readline_recode did not have enough room

347

   (in the s buffer) to copy entire contents of the line read

348

   by tok->decoding_readline.  tok->decoding_buffer has the overflow.

349

   In this case, tok_readline_recode is called in a loop (with an expanded buffer)

350

   until the buffer ends with a '\n' (or until the end of the file is

351

   reached): see tok_nextc and its calls to tok_reserve_buf.

352

*/

353

354

static int

355

tok_reserve_buf(struct tok_state *tok, Py_ssize_t size)

356

{

357

Py_ssize_t cur = tok->cur - tok->buf;

358

Py_ssize_t oldsize = tok->inp - tok->buf;

359

Py_ssize_t newsize = oldsize + Py_MAX(size, oldsize >> 1);

360

if (newsize > tok->end - tok->buf) {

Branch (360:9): [True: 9, False: 801]

361

    char *newbuf = tok->buf;

362

    Py_ssize_t start = tok->start == NULL ? 

-11

:

tok->start - tok->buf8

;

Branch (362:28): [True: 1, False: 8]

363

    Py_ssize_t line_start = tok->start == NULL ? 

-11

:

tok->line_start - tok->buf8

;

Branch (363:33): [True: 1, False: 8]

364

    Py_ssize_t multi_line_start = tok->multi_line_start - tok->buf;

365

    newbuf = (char *)PyMem_Realloc(newbuf, newsize);

366

    if (newbuf == NULL) {

Branch (366:13): [True: 0, False: 9]

367

        tok->done = E_NOMEM;

368

        return 0;

369

    }

370

    tok->buf = newbuf;

371

    tok->cur = tok->buf + cur;

372

    tok->inp = tok->buf + oldsize;

373

    tok->end = tok->buf + newsize;

374

    tok->start = start < 0 ? NULL : 

tok->buf + start8

;

Branch (374:22): [True: 1, False: 8]

375

    tok->line_start = line_start < 0 ? NULL : 

tok->buf + line_start8

;

Branch (375:27): [True: 1, False: 8]

376

    tok->multi_line_start = multi_line_start < 0 ? NULL : tok->buf + multi_line_start;

Branch (376:33): [True: 0, False: 9]

377

}

378

return 1;

379

}

380

381

static int

382

tok_readline_recode(struct tok_state *tok) {

383

PyObject *line;

384

const  char *buf;

385

Py_ssize_t buflen;

386

line = tok->decoding_buffer;

387

if (line == NULL) {

Branch (387:9): [True: 0, False: 0]

388

    line = PyObject_CallNoArgs(tok->decoding_readline);

389

    if (line == NULL) {

Branch (389:13): [True: 0, False: 0]

390

        error_ret(tok);

391

        goto error;

392

    }

393

}

394

else {

395

    tok->decoding_buffer = NULL;

396

}

397

buf = PyUnicode_AsUTF8AndSize(line, &buflen);

398

if (buf == NULL) {

Branch (398:9): [True: 0, False: 0]

399

    error_ret(tok);

400

    goto error;

401

}

402

if (!tok_reserve_buf(tok, buflen + 1)) {

Branch (402:9): [True: 0, False: 0]

403

    goto error;

404

}

405

memcpy(tok->inp, buf, buflen);

406

tok->inp += buflen;

407

*tok->inp = '\0';

408

if (tok->fp_interactive &&

Branch (408:9): [True: 0, False: 0]

409

    tok_concatenate_interactive_new_line(tok, buf) == -1) {

Branch (409:9): [True: 0, False: 0]

410

    goto error;

411

}

412

Py_DECREF(line);

413

return 1;

414

error:

415

Py_XDECREF(line);

416

return 0;

417

}

418

419

/* Set the readline function for TOK to a StreamReader's

420

readline function. The StreamReader is named ENC.

421

422

This function is called from check_bom and check_coding_spec.

423

424

ENC is usually identical to the future value of tok->encoding,

425

except for the (currently unsupported) case of UTF-16.

426

427

Return 1 on success, 0 on failure. */

428

429

static int

430

fp_setreadl(struct tok_state tok, const char enc)

431

{

432

PyObject *readline, *open, *stream;

433

int fd;

434

long pos;

435

436

fd = fileno(tok->fp);

437

/* Due to buffering the file offset for fd can be different from the file

438

 * position of tok->fp.  If tok->fp was opened in text mode on Windows,

439

 * its file position counts CRLF as one char and can't be directly mapped

440

 * to the file offset for fd.  Instead we step back one byte and read to

441

 * the end of line.*/

442

pos = ftell(tok->fp);

443

if (pos == -1 ||

Branch (443:9): [True: 0, False: 0]

444

    lseek(fd, (off_t)(pos > 0 ? pos - 1 : pos), SEEK_SET) == (off_t)-1) {

Branch (444:9): [True: 0, False: 0] Branch (444:27): [True: 0, False: 0]

445

    PyErr_SetFromErrnoWithFilename(PyExc_OSError, NULL);

446

    return 0;

447

}

448

449

open = _PyImport_GetModuleAttrString("io", "open");

450

if (open == NULL) {

Branch (450:9): [True: 0, False: 0]

451

    return 0;

452

}

453

stream = PyObject_CallFunction(open, "isisOOO",

454

                fd, "r", -1, enc, Py_None, Py_None, Py_False);

455

Py_DECREF(open);

456

if (stream == NULL) {

Branch (456:9): [True: 0, False: 0]

457

    return 0;

458

}

459

460

readline = PyObject_GetAttr(stream, &_Py_ID(readline));

461

Py_DECREF(stream);

462

if (readline == NULL) {

Branch (462:9): [True: 0, False: 0]

463

    return 0;

464

}

465

Py_XSETREF(tok->decoding_readline, readline);

466

467

if (pos > 0) {

Branch (467:9): [True: 0, False: 0]

468

    PyObject *bufobj = _PyObject_CallNoArgs(readline);

469

    if (bufobj == NULL) {

Branch (469:13): [True: 0, False: 0]

470

        return 0;

471

    }

472

    Py_DECREF(bufobj);

473

}

474

475

return 1;

476

}

477

478

/* Fetch the next byte from TOK. */

479

480

static int fp_getc(struct tok_state *tok) {

481

return getc(tok->fp);

482

}

483

484

/* Unfetch the last byte back into TOK. */

485

486

static void fp_ungetc(int c, struct tok_state *tok) {

487

ungetc(c, tok->fp);

488

}

489

490

/* Check whether the characters at s start a valid
   UTF-8 sequence. Return the number of characters forming
   the sequence if yes, 0 if not.

   S must be NUL-terminated.  Only the lead byte's range and the
   0x80..0xBF range of each continuation byte are checked. */
static int valid_utf8(const unsigned char* s)
{
    int expected;
    int i;

    if (*s < 0x80)
        /* single-byte code */
        return 1;
    if (*s < 0xc0)
        /* following byte */
        return 0;
    if (*s < 0xE0)
        expected = 1;
    else if (*s < 0xF0)
        expected = 2;
    else if (*s < 0xF8)
        expected = 3;
    else
        return 0;

    /* Check the continuation bytes front to back.  A truncated sequence
       then stops at the terminating NUL (which is < 0x80) instead of
       reading bytes that lie beyond the end of the string. */
    for (i = 1; i <= expected; i++) {
        if (s[i] < 0x80 || s[i] >= 0xC0)
            return 0;
    }
    return expected + 1;
}

517

518

static int

519

ensure_utf8(char *line, struct tok_state *tok)

520

{

521

int badchar = 0;

522

unsigned char *c;

523

int length;

524

for (c = (unsigned char *)line; *c; 

c += length18.2k

) {

Branch (524:37): [True: 18.2k, False: 806]

525

    if (!(length = valid_utf8(c))) {

Branch (525:13): [True: 0, False: 18.2k]

526

        badchar = *c;

527

        break;

528

    }

529

}

530

if (badchar) {

Branch (530:9): [True: 0, False: 806]

531

    /* Need to add 1 to the line number, since this line

532

   has not been counted, yet.  */

533

    PyErr_Format(PyExc_SyntaxError,

534

                 "Non-UTF-8 code starting with '\\x%.2x' "

535

                 "in file %U on line %i, "

536

                 "but no encoding declared; "

537

                 "see https://peps.python.org/pep-0263/ for details",

538

                 badchar, tok->filename, tok->lineno + 1);

539

    return 0;

540

}

541

return 1;

542

}

543

544

/* Fetch a byte from TOK, using the string buffer. */

545

546

static int

547

buf_getc(struct tok_state *tok) {

548

return Py_CHARMASK(*tok->str++);

549

}

550

551

/* Unfetch a byte from TOK, using the string buffer. */

552

553

static void

554

buf_ungetc(int c, struct tok_state *tok) {

555

tok->str--;

556

assert(Py_CHARMASK(*tok->str) == c);        /* tok->cur may point to read-only segment */

557

}

558

559

/* Set the readline function for TOK to ENC. For the string-based

560

tokenizer, this means to just record the encoding. */

561

562

static int

563

buf_setreadl(struct tok_state tok, const char enc) {

564

tok->enc = enc;

565

return 1;

566

}

567

568

/* Return a UTF-8 encoding Python string object from the

569

C byte string STR, which is encoded with ENC. */

570

571

static PyObject *

572

translate_into_utf8(const char* str, const char* enc) {

573

PyObject *utf8;

574

PyObject* buf = PyUnicode_Decode(str, strlen(str), enc, NULL);

575

if (buf == NULL)

Branch (575:9): [True: 6, False: 37]

576

    return NULL;

577

utf8 = PyUnicode_AsUTF8String(buf);

578

Py_DECREF(buf);

579

return utf8;

580

}

581

582

583

static char *

584

translate_newlines(const char *s, int exec_input, struct tok_state *tok) {

585

int skip_next_lf = 0;

586

size_t needed_length = strlen(s) + 2, final_length;

587

char *buf, *current;

588

char c = '\0';

589

buf = PyMem_Malloc(needed_length);

590

if (buf == NULL) {

Branch (590:9): [True: 0, False: 132k]

591

    tok->done = E_NOMEM;

592

    return NULL;

593

}

594

for (current = buf; 132k

*s;

s++, current++27.2M

) {

Branch (594:25): [True: 27.2M, False: 132k]

595

    c = *s;

596

    if (skip_next_lf) {

Branch (596:13): [True: 40, False: 27.2M]

597

        skip_next_lf = 0;

598

        if (c == '\n') {

Branch (598:17): [True: 23, False: 17]

599

            c = *++s;

600

            if (!c)

Branch (600:21): [True: 7, False: 16]

601

                break;

602

        }

603

    }

604

    if (c == '\r') {

Branch (604:13): [True: 47, False: 27.2M]

605

        skip_next_lf = 1;

606

        c = '\n';

607

    }

608

    *current = c;

609

}

610

/* If this is exec input, add a newline to the end of the string if

611

   there isn't one already. */

612

if (exec_input && 

c != '\n'90.7k

) {

Branch (612:9): [True: 90.7k, False: 41.6k] Branch (612:23): [True: 86.4k, False: 4.29k]

613

    *current = '\n';

614

    current++;

615

}

616

*current = '\0';

617

final_length = current - buf + 1;

618

if (final_length < needed_length && 

final_length45.9k

) {

Branch (618:9): [True: 45.9k, False: 86.4k] Branch (618:41): [True: 45.9k, False: 0]

619

    /* should never fail */

620

    char* result = PyMem_Realloc(buf, final_length);

621

    if (result == NULL) {

Branch (621:13): [True: 0, False: 45.9k]

622

        PyMem_Free(buf);

623

    }

624

    buf = result;

625

}

626

return buf;

627

}

628

629

/* Decode a byte string STR for use as the buffer of TOK.

630

Look for encoding declarations inside STR, and record them

631

inside TOK. */

632

633

static char *

634

decode_str(const char *input, int single, struct tok_state *tok)

635

{

636

PyObject* utf8 = NULL;

637

char *str;

638

const char *s;

639

const char *newl[2] = {NULL, NULL};

640

int lineno = 0;

641

tok->input = str = translate_newlines(input, single, tok);

642

if (str == NULL)

Branch (642:9): [True: 0, False: 75.4k]

643

    return NULL;

644

tok->enc = NULL;

645

tok->str = str;

646

if (!check_bom(buf_getc, buf_ungetc, buf_setreadl, tok))

Branch (646:9): [True: 0, False: 75.4k]

647

    return error_ret(tok);

648

str = tok->str;             /* string after BOM if any */

649

assert(str);

650

if (tok->enc != NULL) {

Branch (650:9): [True: 0, False: 75.4k]

651

    utf8 = translate_into_utf8(str, tok->enc);

652

    if (utf8 == NULL)

Branch (652:13): [True: 0, False: 0]

653

        return error_ret(tok);

654

    str = PyBytes_AsString(utf8);

655

}

656

for (s = str;; 75.4k

s++846k

) {

657

    if (*s == '\0') 

break74.2k

;

Branch (657:13): [True: 74.2k, False: 847k]

658

    else if (*s == '\n') {

Branch (658:18): [True: 76.5k, False: 770k]

659

        assert(lineno < 2);

660

        newl[lineno] = s;

661

        lineno++;

662

        if (lineno == 2) 

break1.18k

;

Branch (662:17): [True: 1.18k, False: 75.4k]

663

    }

664

}

665

tok->enc = NULL;

666

/* need to check line 1 and 2 separately since check_coding_spec

667

   assumes a single line as input */

668

if (newl[0]) {

Branch (668:9): [True: 75.4k, False: 13]

669

    if (!check_coding_spec(str, newl[0] - str, tok, buf_setreadl)) {

Branch (669:13): [True: 20, False: 75.3k]

670

        return NULL;

671

    }

672

    if (tok->enc == NULL && 

tok->decoding_state != STATE_NORMAL75.3k

&&

newl[1]481

) {

Branch (672:13): [True: 75.3k, False: 36] Branch (672:33): [True: 481, False: 74.8k] Branch (672:72): [True: 224, False: 257]

673

        if (!check_coding_spec(newl[0]+1, newl[1] - newl[0],

Branch (673:17): [True: 0, False: 224]

674

                               tok, buf_setreadl))

675

            return NULL;

676

    }

677

}

678

if (tok->enc != NULL) {

Branch (678:9): [True: 43, False: 75.3k]

679

    assert(utf8 == NULL);

680

    utf8 = translate_into_utf8(str, tok->enc);

681

    if (utf8 == NULL)

Branch (681:13): [True: 6, False: 37]

682

        return error_ret(tok);

683

    str = PyBytes_AS_STRING(utf8);

684

}

685

assert(tok->decoding_buffer == NULL);

686

tok->decoding_buffer = utf8; /* CAUTION */

687

return str;

688

}

689

690

/* Set up tokenizer for string */

691

692

struct tok_state *

693

_PyTokenizer_FromString(const char *str, int exec_input)

694

{

695

struct tok_state *tok = tok_new();

696

char *decoded;

697

698

if (tok == NULL)

Branch (698:9): [True: 0, False: 75.4k]

699

    return NULL;

700

decoded = decode_str(str, exec_input, tok);

701

if (decoded == NULL) {

Branch (701:9): [True: 26, False: 75.4k]

702

    _PyTokenizer_Free(tok);

703

    return NULL;

704

}

705

706

tok->buf = tok->cur = tok->inp = decoded;

707

tok->end = decoded;

708

return tok;

709

}

710

711

/* Set up tokenizer for UTF-8 string */

712

713

struct tok_state *

714

_PyTokenizer_FromUTF8(const char *str, int exec_input)

715

{

716

struct tok_state *tok = tok_new();

717

char *translated;

718

if (tok == NULL)

Branch (718:9): [True: 0, False: 56.9k]

719

    return NULL;

720

tok->input = translated = translate_newlines(str, exec_input, tok);

721

if (translated == NULL) {

Branch (721:9): [True: 0, False: 56.9k]

722

    _PyTokenizer_Free(tok);

723

    return NULL;

724

}

725

tok->decoding_state = STATE_NORMAL;

726

tok->enc = NULL;

727

tok->str = translated;

728

tok->encoding = new_string("utf-8", 5, tok);

729

if (!tok->encoding) {

Branch (729:9): [True: 0, False: 56.9k]

730

    _PyTokenizer_Free(tok);

731

    return NULL;

732

}

733

734

tok->buf = tok->cur = tok->inp = translated;

735

tok->end = translated;

736

return tok;

737

}

738

739

/* Set up tokenizer for file */

740

741

struct tok_state *

742

_PyTokenizer_FromFile(FILE fp, const char enc,

743

                  const char *ps1, const char *ps2)

744

{

745

struct tok_state *tok = tok_new();

746

if (tok == NULL)

Branch (746:9): [True: 0, False: 257]

747

    return NULL;

748

if ((tok->buf = (char *)PyMem_Malloc(BUFSIZ)) == NULL) {

Branch (748:9): [True: 0, False: 257]

749

    _PyTokenizer_Free(tok);

750

    return NULL;

751

}

752

tok->cur = tok->inp = tok->buf;

753

tok->end = tok->buf + BUFSIZ;

754

tok->fp = fp;

755

tok->prompt = ps1;

756

tok->nextprompt = ps2;

757

if (enc != NULL) {

Branch (757:9): [True: 0, False: 257]

758

    /* Must copy encoding declaration since it

759

       gets copied into the parse tree. */

760

    tok->encoding = new_string(enc, strlen(enc), tok);

761

    if (!tok->encoding) {

Branch (761:13): [True: 0, False: 0]

762

        _PyTokenizer_Free(tok);

763

        return NULL;

764

    }

765

    tok->decoding_state = STATE_NORMAL;

766

}

767

return tok;

768

}

769

770

/* Free a tok_state structure */

771

772

void

773

_PyTokenizer_Free(struct tok_state *tok)

774

{

775

if (tok->encoding != NULL) {

Branch (775:9): [True: 57.0k, False: 75.5k]

776

    PyMem_Free(tok->encoding);

777

}

778

Py_XDECREF(tok->decoding_readline);

779

Py_XDECREF(tok->decoding_buffer);

780

Py_XDECREF(tok->filename);

781

if (tok->fp != NULL && 

tok->buf != NULL257

) {

Branch (781:9): [True: 257, False: 132k] Branch (781:28): [True: 257, False: 0]

782

    PyMem_Free(tok->buf);

783

}

784

if (tok->input) {

Branch (784:9): [True: 132k, False: 257]

785

    PyMem_Free(tok->input);

786

}

787

if (tok->interactive_src_start != NULL) {

Branch (787:9): [True: 0, False: 132k]

788

    PyMem_Free(tok->interactive_src_start);

789

}

790

PyMem_Free(tok);

791

}

792

793

static int

794

tok_readline_raw(struct tok_state *tok)

795

{

796

do {

797

    if (!tok_reserve_buf(tok, BUFSIZ)) {

Branch (797:13): [True: 0, False: 810]

798

        return 0;

799

    }

800

    char *line = Py_UniversalNewlineFgets(tok->inp,

801

                                          (int)(tok->end - tok->inp),

802

                                          tok->fp, NULL);

803

    if (line == NULL) {

Branch (803:13): [True: 4, False: 806]

804

        return 1;

805

    }

806

    if (tok->fp_interactive &&

Branch (806:13): [True: 0, False: 806]

807

tok_concatenate_interactive_new_line(tok, line) == -10

) {

Branch (807:13): [True: 0, False: 0]

808

        return 0;

809

    }

810

    tok->inp = strchr(tok->inp, '\0');

811

    if (tok->inp == tok->buf) {

Branch (811:13): [True: 0, False: 806]

812

        return 0;

813

    }

814

} while (tok->inp[-1] != '\n');

Branch (814:14): [True: 1, False: 805]

815

return 1;

816

}

817

818

static int

819

tok_underflow_string(struct tok_state *tok) {

820

char *end = strchr(tok->inp, '\n');

821

if (end != NULL) {

Branch (821:9): [True: 3.37M, False: 92.9k]

822

    end++;

823

}

824

else {

825

    end = strchr(tok->inp, '\0');

826

    if (end == tok->inp) {

Branch (826:13): [True: 55.7k, False: 37.1k]

827

        tok->done = E_EOF;

828

        return 0;

829

    }

830

}

831

if (tok->start == NULL) {

Branch (831:9): [True: 3.34M, False: 64.1k]

832

    tok->buf = tok->cur;

833

}

834

tok->line_start = tok->cur;

835

tok->lineno++;

836

tok->inp = end;

837

return 1;

838

}

839

840

static int

841

tok_underflow_interactive(struct tok_state *tok) {

842

if (tok->interactive_underflow == IUNDERFLOW_STOP) {

Branch (842:9): [True: 0, False: 0]

843

    tok->done = E_INTERACT_STOP;

844

    return 1;

845

}

846

char *newtok = PyOS_Readline(tok->fp ? tok->fp : stdin, stdout, tok->prompt);

Branch (846:34): [True: 0, False: 0]

847

if (newtok != NULL) {

Branch (847:9): [True: 0, False: 0]

848

    char *translated = translate_newlines(newtok, 0, tok);

849

    PyMem_Free(newtok);

850

    if (translated == NULL) {

Branch (850:13): [True: 0, False: 0]

851

        return 0;

852

    }

853

    newtok = translated;

854

}

855

if (tok->encoding && newtok && *newtok) {

Branch (855:9): [True: 0, False: 0] Branch (855:26): [True: 0, False: 0] Branch (855:36): [True: 0, False: 0]

856

    /* Recode to UTF-8 */

857

    Py_ssize_t buflen;

858

    const char* buf;

859

    PyObject *u = translate_into_utf8(newtok, tok->encoding);

860

    PyMem_Free(newtok);

861

    if (u == NULL) {

Branch (861:13): [True: 0, False: 0]

862

        tok->done = E_DECODE;

863

        return 0;

864

    }

865

    buflen = PyBytes_GET_SIZE(u);

866

    buf = PyBytes_AS_STRING(u);

867

    newtok = PyMem_Malloc(buflen+1);

868

    if (newtok == NULL) {

Branch (868:13): [True: 0, False: 0]

869

        Py_DECREF(u);

870

        tok->done = E_NOMEM;

871

        return 0;

872

    }

873

    strcpy(newtok, buf);

874

    Py_DECREF(u);

875

}

876

if (tok->fp_interactive &&

Branch (876:9): [True: 0, False: 0]

877

    tok_concatenate_interactive_new_line(tok, newtok) == -1) {

Branch (877:9): [True: 0, False: 0]

878

    PyMem_Free(newtok);

879

    return 0;

880

}

881

if (tok->nextprompt != NULL) {

Branch (881:9): [True: 0, False: 0]

882

    tok->prompt = tok->nextprompt;

883

}

884

if (newtok == NULL) {

Branch (884:9): [True: 0, False: 0]

885

    tok->done = E_INTR;

886

}

887

else if (*newtok == '\0') {

Branch (887:14): [True: 0, False: 0]

888

    PyMem_Free(newtok);

889

    tok->done = E_EOF;

890

}

891

else if (tok->start != NULL) {

Branch (891:14): [True: 0, False: 0]

892

    Py_ssize_t cur_multi_line_start = tok->multi_line_start - tok->buf;

893

    size_t size = strlen(newtok);

894

    tok->lineno++;

895

    if (!tok_reserve_buf(tok, size + 1)) {

Branch (895:13): [True: 0, False: 0]

896

        PyMem_Free(tok->buf);

897

        tok->buf = NULL;

898

        PyMem_Free(newtok);

899

        return 0;

900

    }

901

    memcpy(tok->cur, newtok, size + 1);

902

    PyMem_Free(newtok);

903

    tok->inp += size;

904

    tok->multi_line_start = tok->buf + cur_multi_line_start;

905

}

906

else {

907

    tok->lineno++;

908

    PyMem_Free(tok->buf);

909

    tok->buf = newtok;

910

    tok->cur = tok->buf;

911

    tok->line_start = tok->buf;

912

    tok->inp = strchr(tok->buf, '\0');

913

    tok->end = tok->inp + 1;

914

}

915

if (tok->done != E_OK) {

Branch (915:9): [True: 0, False: 0]

916

    if (tok->prompt != NULL) {

Branch (916:13): [True: 0, False: 0]

917

        PySys_WriteStderr("\n");

918

    }

919

    return 0;

920

}

921

return 1;

922

}

923

924

static int

925

tok_underflow_file(struct tok_state *tok) {

926

if (tok->start == NULL) {

Branch (926:9): [True: 801, False: 8]

927

    tok->cur = tok->inp = tok->buf;

928

}

929

if (tok->decoding_state == STATE_INIT) {

Branch (929:9): [True: 257, False: 552]

930

    /* We have not yet determined the encoding.

931

       If an encoding is found, use the file-pointer

932

       reader functions from now on. */

933

    if (!check_bom(fp_getc, fp_ungetc, fp_setreadl, tok)) {

Branch (933:13): [True: 0, False: 257]

934

        error_ret(tok);

935

        return 0;

936

    }

937

    assert(tok->decoding_state != STATE_INIT);

938

}

939

/* Read until '\n' or EOF */

940

if (tok->decoding_readline != NULL) {

Branch (940:9): [True: 0, False: 809]

941

    /* We already have a codec associated with this input. */

942

    if (!tok_readline_recode(tok)) {

Branch (942:13): [True: 0, False: 0]

943

        return 0;

944

    }

945

}

946

else {

947

    /* We want a 'raw' read. */

948

    if (!tok_readline_raw(tok)) {

Branch (948:13): [True: 0, False: 809]

949

        return 0;

950

    }

951

}

952

if (tok->inp == tok->cur) {

Branch (952:9): [True: 3, False: 806]

953

    tok->done = E_EOF;

954

    return 0;

955

}

956

if (tok->inp[-1] != '\n') {

Branch (956:9): [True: 1, False: 805]

957

    /* Last line does not end in \n, fake one */

958

    *tok->inp++ = '\n';

959

    *tok->inp = '\0';

960

}

961

962

tok->lineno++;

963

if (tok->decoding_state != STATE_NORMAL) {

Branch (963:9): [True: 425, False: 381]

964

    if (tok->lineno > 2) {

Branch (964:13): [True: 84, False: 341]

965

        tok->decoding_state = STATE_NORMAL;

966

    }

967

    else if (!check_coding_spec(tok->cur, strlen(tok->cur),

Branch (967:18): [True: 0, False: 341]

968

                                tok, fp_setreadl))

969

    {

970

        return 0;

971

    }

972

}

973

/* The default encoding is UTF-8, so make sure we don't have any

974

   non-UTF-8 sequences in it. */

975

if (!tok->encoding && !ensure_utf8(tok->cur, tok)) {

Branch (975:9): [True: 806, False: 0] Branch (975:27): [True: 0, False: 806]

976

    error_ret(tok);

977

    return 0;

978

}

979

assert(tok->done == E_OK);

980

return tok->done == E_OK;

981

}

982

983

#if defined(Py_DEBUG)

984

static void

985

print_escape(FILE *f, const char *s, Py_ssize_t size)

986

{

987

if (s == NULL) {

988

    fputs("NULL", f);

989

    return;

990

}

991

putc('"', f);

992

while (size-- > 0) {

993

    unsigned char c = *s++;

994

    switch (c) {

995

        case '\n': fputs("\\n", f); break;

996

        case '\r': fputs("\\r", f); break;

997

        case '\t': fputs("\\t", f); break;

998

        case '\f': fputs("\\f", f); break;

999

        case '\'': fputs("\\'", f); break;

1000

        case '"': fputs("\\\"", f); break;

1001

        default:

1002

            if (0x20 <= c && c <= 0x7f)

1003

                putc(c, f);

1004

            else

1005

                fprintf(f, "\\x%02x", c);

1006

    }

1007

}

1008

putc('"', f);

1009

}

1010

#endif

1011

1012

/* Get next char, updating state; error code goes into tok->done */

1013

1014

static int

1015

tok_nextc(struct tok_state *tok)

1016

{

1017

int rc;

1018

for (;;) {

1019

    if (tok->cur != tok->inp) {

Branch (1019:13): [True: 47.0M, False: 3.57M]

1020

        return Py_CHARMASK(*tok->cur++); /* Fast path */

1021

    }

1022

    if (tok->done != E_OK) {

Branch (1022:13): [True: 107k, False: 3.46M]

1023

       return EOF;

1024

    }

1025

    if (tok->fp == NULL) {

Branch (1025:13): [True: 3.46M, False: 809]

1026

        rc = tok_underflow_string(tok);

1027

    }

1028

    else if (tok->prompt != NULL) {

Branch (1028:18): [True: 0, False: 809]

1029

        rc = tok_underflow_interactive(tok);

1030

    }

1031

    else {

1032

        rc = tok_underflow_file(tok);

1033

    }

1034

#if defined(Py_DEBUG)

1035

    if (tok->debug) {

1036

        fprintf(stderr, "line[%d] = ", tok->lineno);

1037

        print_escape(stderr, tok->cur, tok->inp - tok->cur);

1038

        fprintf(stderr, "  tok->done = %d\n", tok->done);

1039

    }

1040

#endif

1041

    if (!rc) {

Branch (1041:13): [True: 55.7k, False: 3.41M]

1042

        tok->cur = tok->inp;

1043

        return EOF;

1044

    }

1045

    tok->line_start = tok->cur;

1046

}

1047

Py_UNREACHABLE0

();

1048

}

1049

1050

/* Back-up one character */

1051

1052

static void

1053

tok_backup(struct tok_state *tok, int c)

1054

{

1055

if (c != EOF) {

Branch (1055:9): [True: 19.9M, False: 107k]

1056

    if (--tok->cur < tok->buf) {

Branch (1056:13): [True: 0, False: 19.9M]

1057

        Py_FatalError("tokenizer beginning of buffer");

1058

    }

1059

    if ((int)(unsigned char)*tok->cur != c) {

Branch (1059:13): [True: 0, False: 19.9M]

1060

        Py_FatalError("tok_backup: wrong character");

1061

    }

1062

}

1063

}

1064

1065

static int

1066

_syntaxerror_range(struct tok_state *tok, const char *format,

1067

               int col_offset, int end_col_offset,

1068

               va_list vargs)

1069

{

1070

PyObject *errmsg, *errtext, *args;

1071

errmsg = PyUnicode_FromFormatV(format, vargs);

1072

if (!errmsg) {

Branch (1072:9): [True: 0, False: 228]

1073

    goto error;

1074

}

1075

1076

errtext = PyUnicode_DecodeUTF8(tok->line_start, tok->cur - tok->line_start,

1077

                               "replace");

1078

if (!errtext) {

Branch (1078:9): [True: 0, False: 228]

1079

    goto error;

1080

}

1081

1082

if (col_offset == -1) {

Branch (1082:9): [True: 215, False: 13]

1083

    col_offset = (int)PyUnicode_GET_LENGTH(errtext);

1084

}

1085

if (end_col_offset == -1) {

Branch (1085:9): [True: 215, False: 13]

1086

    end_col_offset = col_offset;

1087

}

1088

1089

Py_ssize_t line_len = strcspn(tok->line_start, "\n");

1090

if (line_len != tok->cur - tok->line_start) {

Branch (1090:9): [True: 153, False: 75]

1091

    Py_DECREF(errtext);

1092

    errtext = PyUnicode_DecodeUTF8(tok->line_start, line_len,

1093

                                   "replace");

1094

}

1095

if (!errtext) {

Branch (1095:9): [True: 0, False: 228]

1096

    goto error;

1097

}

1098

1099

args = Py_BuildValue("(O(OiiNii))", errmsg, tok->filename, tok->lineno,

1100

                     col_offset, errtext, tok->lineno, end_col_offset);

1101

if (args) {

Branch (1101:9): [True: 228, False: 0]

1102

    PyErr_SetObject(PyExc_SyntaxError, args);

1103

    Py_DECREF(args);

1104

}

1105

1106

error:

1107

Py_XDECREF(errmsg);

1108

tok->done = E_ERROR;

1109

return ERRORTOKEN;

1110

}

1111

1112

/* Report a SyntaxError at the current cursor position.

   printf-style wrapper around _syntaxerror_range() that passes -1 for
   both column offsets, so they are derived from the cursor.  Returns
   whatever _syntaxerror_range() returns. */
static int
syntaxerror(struct tok_state *tok, const char *format, ...)
{
    int result;
    va_list ap;

    va_start(ap, format);
    result = _syntaxerror_range(tok, format, -1, -1, ap);
    va_end(ap);
    return result;
}

1121

1122

/* Report a SyntaxError with explicit start and end column offsets.

   printf-style wrapper around _syntaxerror_range().  Returns whatever
   _syntaxerror_range() returns. */
static int
syntaxerror_known_range(struct tok_state *tok,
                        int col_offset, int end_col_offset,
                        const char *format, ...)
{
    int result;
    va_list ap;

    va_start(ap, format);
    result = _syntaxerror_range(tok, format, col_offset, end_col_offset, ap);
    va_end(ap);
    return result;
}

1133

1134

1135

1136

static int

1137

indenterror(struct tok_state *tok)

1138

{

1139

tok->done = E_TABSPACE;

1140

tok->cur = tok->inp;

1141

return ERRORTOKEN;

1142

}

1143

1144

static int

1145

parser_warn(struct tok_state *tok, PyObject *category, const char *format, ...)

1146

{

1147

PyObject *errmsg;

1148

va_list vargs;

1149

va_start(vargs, format);

1150

errmsg = PyUnicode_FromFormatV(format, vargs);

1151

va_end(vargs);

1152

if (!errmsg) {

Branch (1152:9): [True: 0, False: 130]

1153

    goto error;

1154

}

1155

1156

if (PyErr_WarnExplicitObject(category, errmsg, tok->filename,

Branch (1156:9): [True: 64, False: 66]

1157

                             tok->lineno, NULL, NULL) < 0) {

1158

    if (PyErr_ExceptionMatches(category)) {

Branch (1158:13): [True: 64, False: 0]

1159

        /* Replace the DeprecationWarning exception with a SyntaxError

1160

           to get a more accurate error report */

1161

        PyErr_Clear();

1162

        syntaxerror(tok, "%U", errmsg);

1163

    }

1164

    goto error;

1165

}

1166

Py_DECREF(errmsg);

1167

return 0;

1168

1169

error:

1170

Py_XDECREF(errmsg);

1171

tok->done = E_ERROR;

1172

return -1;

1173

}

1174

1175

/* Check whether the upcoming input is exactly the string `test`
   followed by a character that cannot continue an identifier.

   Every character read is pushed back before returning, so the cursor
   is left where it started.  Returns 1 on a match, 0 otherwise. */
static int
lookahead(struct tok_state *tok, const char *test)
{
    const char *p = test;
    int c = tok_nextc(tok);

    /* Consume input for as long as it agrees with `test`. */
    while (*p != 0 && c == *p) {
        p++;
        c = tok_nextc(tok);
    }

    /* A match needs all of `test` consumed and a following character
       that is not an identifier character. */
    int matched = (*p == 0) && !is_potential_identifier_char(c);

    /* Undo every read, most recent first. */
    tok_backup(tok, c);
    while (p != test) {
        tok_backup(tok, *--p);
    }
    return matched;
}

1197

1198

static int

1199

verify_end_of_number(struct tok_state *tok, int c, const char *kind)

1200

{

1201

/* Emit a deprecation warning only if the numeric literal is immediately

1202

 * followed by one of keywords which can occur after a numeric literal

1203

 * in valid code: "and", "else", "for", "if", "in", "is" and "or".

1204

 * It allows to gradually deprecate existing valid code without adding

1205

 * warning before error in most cases of invalid numeric literal (which

1206

 * would be confusing and break existing tests).

1207

 * Raise a syntax error with slightly better message than plain

1208

 * "invalid syntax" if the numeric literal is immediately followed by

1209

 * other keyword or identifier.

1210

 */

1211

int r = 0;

1212

if (c == 'a') {

Branch (1212:9): [True: 14, False: 646k]

1213

    r = lookahead(tok, "nd");

1214

}

1215

else if (c == 'e') {

Branch (1215:14): [True: 24, False: 646k]

1216

    r = lookahead(tok, "lse");

1217

}

1218

else if (c == 'f') {

Branch (1218:14): [True: 14, False: 646k]

1219

    r = lookahead(tok, "or");

1220

}

1221

else if (c == 'i') {

Branch (1221:14): [True: 48, False: 646k]

1222

    int c2 = tok_nextc(tok);

1223

    if (c2 == 'f' || 

c2 == 'n'32

||

c2 == 's'16

) {

Branch (1223:13): [True: 16, False: 32] Branch (1223:26): [True: 16, False: 16] Branch (1223:39): [True: 16, False: 0]

1224

        r = 1;

1225

    }

1226

    tok_backup(tok, c2);

1227

}

1228

else if (c == 'o') {

Branch (1228:14): [True: 22, False: 646k]

1229

    r = lookahead(tok, "r");

1230

}

1231

else if (c == 'n') {

Branch (1231:14): [True: 17, False: 646k]

1232

    r = lookahead(tok, "ot");

1233

}

1234

if (r) {

Branch (1234:9): [True: 130, False: 646k]

1235

    tok_backup(tok, c);

1236

    if (parser_warn(tok, PyExc_SyntaxWarning,

Branch (1236:13): [True: 64, False: 66]

1237

            "invalid %s literal", kind))

1238

    {

1239

        return 0;

1240

    }

1241

    tok_nextc(tok);

1242

}

1243

else /* In future releases, only error will remain. */

1244

if (is_potential_identifier_char(c)) {

1245

    tok_backup(tok, c);

1246

    syntaxerror(tok, "invalid %s literal", kind);

1247

    return 0;

1248

}

1249

return 1;

1250

}

1251

1252

/* Verify that the identifier follows PEP 3131.

1253

All identifier strings are guaranteed to be "ready" unicode objects.

1254

*/

1255

static int

1256

verify_identifier(struct tok_state *tok)

1257

{

1258

PyObject *s;

1259

if (tok->decoding_erred)

Branch (1259:9): [True: 0, False: 48]

1260

    return 0;

1261

s = PyUnicode_DecodeUTF8(tok->start, tok->cur - tok->start, NULL);

1262

if (s == NULL) {

Branch (1262:9): [True: 4, False: 44]

1263

    if (PyErr_ExceptionMatches(PyExc_UnicodeDecodeError)) {

Branch (1263:13): [True: 4, False: 0]

1264

        tok->done = E_DECODE;

1265

    }

1266

    else {

1267

        tok->done = E_ERROR;

1268

    }

1269

    return 0;

1270

}

1271

Py_ssize_t invalid = _PyUnicode_ScanIdentifier(s);

1272

if (invalid < 0) {

Branch (1272:9): [True: 0, False: 44]

1273

    Py_DECREF(s);

1274

    tok->done = E_ERROR;

1275

    return 0;

1276

}

1277

assert(PyUnicode_GET_LENGTH(s) > 0);

1278

if (invalid < PyUnicode_GET_LENGTH(s)) {

Branch (1278:9): [True: 8, False: 36]

1279

    Py_UCS4 ch = PyUnicode_READ_CHAR(s, invalid);

1280

    if (invalid + 1 < PyUnicode_GET_LENGTH(s)) {

Branch (1280:13): [True: 0, False: 8]

1281

        /* Determine the offset in UTF-8 encoded input */

1282

        Py_SETREF(s, PyUnicode_Substring(s, 0, invalid + 1));

1283

        if (s != NULL) {

Branch (1283:17): [True: 0, False: 0]

1284

            Py_SETREF(s, PyUnicode_AsUTF8String(s));

1285

        }

1286

        if (s == NULL) {

Branch (1286:17): [True: 0, False: 0]

1287

            tok->done = E_ERROR;

1288

            return 0;

1289

        }

1290

        tok->cur = (char *)tok->start + PyBytes_GET_SIZE(s);

1291

    }

1292

    Py_DECREF(s);

1293

    // PyUnicode_FromFormatV() does not support %X

1294

    char hex[9];

1295

    (void)PyOS_snprintf(hex, sizeof(hex), "%04X", ch);

1296

    if (Py_UNICODE_ISPRINTABLE(ch)) {

1297

        syntaxerror(tok, "invalid character '%c' (U+%s)", ch, hex);

1298

    }

1299

    else {

1300

        syntaxerror(tok, "invalid non-printable character U+%s", hex);

1301

    }

1302

    return 0;

1303

}

1304

Py_DECREF(s);

1305

return 1;

1306

}

1307

1308

/* Consume the rest of a run of decimal digits in which single
   underscores may separate digit groups.

   Returns the first character that is neither a digit nor an underscore
   followed by a digit (it has been read, not pushed back).  Returns 0
   after raising "invalid decimal literal" when an underscore is not
   followed by a digit. */
static int
tok_decimal_tail(struct tok_state *tok)
{
    for (;;) {
        int c = tok_nextc(tok);
        while (isdigit(c)) {
            c = tok_nextc(tok);
        }
        if (c != '_') {
            return c;
        }
        /* An underscore must be followed by at least one more digit. */
        c = tok_nextc(tok);
        if (!isdigit(c)) {
            tok_backup(tok, c);
            syntaxerror(tok, "invalid decimal literal");
            return 0;
        }
    }
}

1329

1330

/* Get next token, after space stripping etc. */

1331

1332

static inline int

1333

tok_continuation_line(struct tok_state *tok) {

1334

int c = tok_nextc(tok);

1335

if (c != '\n') {

Branch (1335:9): [True: 7, False: 486]

1336

    tok->done = E_LINECONT;

1337

    return -1;

1338

}

1339

c = tok_nextc(tok);

1340

if (c == EOF) {

Branch (1340:9): [True: 9, False: 477]

1341

    tok->done = E_EOF;

1342

    tok->cur = tok->inp;

1343

    return -1;

1344

} else {

1345

    tok_backup(tok, c);

1346

}

1347

return c;

1348

}

1349

1350

static int

1351

tok_get(struct tok_state *tok, const char **p_start, const char **p_end)

1352

{

1353

int c;

1354

int blankline, nonascii;

1355

1356

*p_start = *p_end = NULL;

1357

nextline:

1358

tok->start = NULL;

1359

blankline = 0;

1360

1361

/* Get indentation level */

1362

if (tok->atbol) {

Branch (1362:9): [True: 3.36M, False: 6.82M]

1363

    int col = 0;

1364

    int altcol = 0;

1365

    tok->atbol = 0;

1366

    int cont_line_col = 0;

1367

    for (;;) {

1368

        c = tok_nextc(tok);

1369

        if (c == ' ') {

Branch (1369:17): [True: 4.90M, False: 3.36M]

1370

            col++, altcol++;

1371

        }

1372

        else if (c == '\t') {

Branch (1372:22): [True: 907, False: 3.36M]

1373

            col = (col / tok->tabsize + 1) * tok->tabsize;

1374

            altcol = (altcol / ALTTABSIZE + 1) * ALTTABSIZE;

1375

        }

1376

        else if (c == '\014')  {/* Control-L (formfeed) */

Branch (1376:22): [True: 7, False: 3.36M]

1377

            col = altcol = 0; /* For Emacs users */

1378

        }

1379

        else if (c == '\\') {

Branch (1379:22): [True: 30, False: 3.36M]

1380

            // Indentation cannot be split over multiple physical lines

1381

            // using backslashes. This means that if we found a backslash

1382

            // preceded by whitespace, **the first one we find** determines

1383

            // the level of indentation of whatever comes next.

1384

            cont_line_col = cont_line_col ? 

cont_line_col6

:

col24

;

Branch (1384:33): [True: 6, False: 24]

1385

            if ((c = tok_continuation_line(tok)) == -1) {

Branch (1385:21): [True: 1, False: 29]

1386

                return ERRORTOKEN;

1387

            }

1388

        }

1389

        else {

1390

            break;

1391

        }

1392

    }

1393

    tok_backup(tok, c);

1394

    if (c == '#' || 

c == '\n'3.33M

) {

Branch (1394:13): [True: 33.8k, False: 3.33M] Branch (1394:25): [True: 2.62M, False: 707k]

1395

        /* Lines with only whitespace and/or comments

1396

           shouldn't affect the indentation and are

1397

           not passed to the parser as NEWLINE tokens,

1398

           except *totally* empty lines in interactive

1399

           mode, which signal the end of a command group. */

1400

        if (col == 0 && 

c == '\n'61.5k

&&

tok->prompt != NULL49.4k

) {

Branch (1400:17): [True: 61.5k, False: 2.59M] Branch (1400:29): [True: 49.4k, False: 12.1k] Branch (1400:42): [True: 0, False: 49.4k]

1401

            blankline = 0; /* Let it through */

1402

        }

1403

        else if (tok->prompt != NULL && 

tok->lineno == 10

) {

Branch (1403:22): [True: 0, False: 2.65M] Branch (1403:45): [True: 0, False: 0]

1404

            /* In interactive mode, if the first line contains

1405

               only spaces and/or a comment, let it through. */

1406

            blankline = 0;

1407

            col = altcol = 0;

1408

        }

1409

        else {

1410

            blankline = 1; /* Ignore completely */

1411

        }

1412

        /* We can't jump back right here since we still

1413

           may need to skip to the end of a comment */

1414

    }

1415

    if (!blankline && 

tok->level == 0707k

) {

Branch (1415:13): [True: 707k, False: 2.65M] Branch (1415:27): [True: 593k, False: 114k]

1416

        col = cont_line_col ? 

cont_line_col10

:

col593k

;

Branch (1416:19): [True: 10, False: 593k]

1417

        altcol = cont_line_col ? 

cont_line_col10

:

altcol593k

;

Branch (1417:22): [True: 10, False: 593k]

1418

        if (col == tok->indstack[tok->indent]) {

Branch (1418:17): [True: 445k, False: 147k]

1419

            /* No change */

1420

            if (altcol != tok->altindstack[tok->indent]) {

Branch (1420:21): [True: 2, False: 445k]

1421

                return indenterror(tok);

1422

            }

1423

        }

1424

        else if (col > tok->indstack[tok->indent]) {

Branch (1424:22): [True: 81.6k, False: 65.7k]

1425

            /* Indent -- always one */

1426

            if (tok->indent+1 >= MAXINDENT) {

Branch (1426:21): [True: 0, False: 81.6k]

1427

                tok->done = E_TOODEEP;

1428

                tok->cur = tok->inp;

1429

                return ERRORTOKEN;

1430

            }

1431

            if (altcol <= tok->altindstack[tok->indent]) {

Branch (1431:21): [True: 0, False: 81.6k]

1432

                return indenterror(tok);

1433

            }

1434

            tok->pendin++;

1435

            tok->indstack[++tok->indent] = col;

1436

            tok->altindstack[tok->indent] = altcol;

1437

        }

1438

        else /* col < tok->indstack[tok->indent] */ {

1439

            /* Dedent -- any number, must be consistent */

1440

            while (tok->indent > 0 &&

Branch (1440:24): [True: 132k, False: 14.9k]

1441

col < tok->indstack[tok->indent]132k

) {

Branch (1441:21): [True: 81.6k, False: 50.7k]

1442

                tok->pendin--;

1443

                tok->indent--;

1444

            }

1445

            if (col != tok->indstack[tok->indent]) {

Branch (1445:21): [True: 6, False: 65.7k]

1446

                tok->done = E_DEDENT;

1447

                tok->cur = tok->inp;

1448

                return ERRORTOKEN;

1449

            }

1450

            if (altcol != tok->altindstack[tok->indent]) {

Branch (1450:21): [True: 0, False: 65.7k]

1451

                return indenterror(tok);

1452

            }

1453

        }

1454

    }

1455

}

1456

1457

tok->start = tok->cur;

1458

1459

/* Return pending indents/dedents */

1460

if (tok->pendin != 0) {

Branch (1460:9): [True: 163k, False: 10.0M]

1461

    if (tok->pendin < 0) {

Branch (1461:13): [True: 81.6k, False: 81.6k]

1462

        tok->pendin++;

1463

        return DEDENT;

1464

    }

1465

    else {

1466

        tok->pendin--;

1467

        return INDENT;

1468

    }

1469

}

1470

1471

/* Peek ahead at the next character */

1472

c = tok_nextc(tok);

1473

tok_backup(tok, c);

1474

/* Check if we are closing an async function */

1475

if (tok->async_def

Branch (1475:9): [True: 141, False: 10.0M]

1476

    && 

!blankline141

Branch (1476:12): [True: 135, False: 6]

1477

    /* Due to some implementation artifacts of type comments,

1478

     * a TYPE_COMMENT at the start of a function won't set an

1479

     * indentation level and it will produce a NEWLINE after it.

1480

     * To avoid spuriously ending an async function due to this,

1481

     * wait until we have some non-newline char in front of us. */

1482

    && 

c != '\n'135

Branch (1482:12): [True: 114, False: 21]

1483

    && 

tok->level == 0114

Branch (1483:12): [True: 75, False: 39]

1484

    /* There was a NEWLINE after ASYNC DEF,

1485

       so we're past the signature. */

1486

    && 

tok->async_def_nl75

Branch (1486:12): [True: 36, False: 39]

1487

    /* Current indentation level is less than where

1488

       the async function was defined */

1489

    && 

tok->async_def_indent >= tok->indent36

)

Branch (1489:12): [True: 9, False: 27]

1490

{

1491

    tok->async_def = 0;

1492

    tok->async_def_indent = 0;

1493

    tok->async_def_nl = 0;

1494

}

1495

1496

again:

1497

tok->start = NULL;

1498

/* Skip spaces */

1499

do {

1500

    c = tok_nextc(tok);

1501

} while (c == ' ' || 

c == '\t'10.0M

||

c == '\014'10.0M

);

Branch (1501:14): [True: 1.18M, False: 10.0M] Branch (1501:26): [True: 4, False: 10.0M] Branch (1501:39): [True: 0, False: 10.0M]

1502

1503

/* Set start of current token */

1504

tok->start = tok->cur - 1;

1505

1506

/* Skip comment, unless it's a type comment */

1507

if (c == '#') {

Branch (1507:9): [True: 38.2k, False: 9.99M]

1508

    const char *prefix, *p, *type_start;

1509

1510

    while (c != EOF && 

c != '\n'1.73M

) {

Branch (1510:16): [True: 1.73M, False: 1] Branch (1510:28): [True: 1.69M, False: 38.2k]

1511

        c = tok_nextc(tok);

1512

    }

1513

1514

    if (tok->type_comments) {

Branch (1514:13): [True: 611, False: 37.6k]

1515

        p = tok->start;

1516

        prefix = type_comment_prefix;

1517

        while (*prefix && 

p < tok->cur4.88k

) {

Branch (1517:20): [True: 4.88k, False: 611] Branch (1517:31): [True: 4.88k, False: 0]

1518

            if (*prefix == ' ') {

Branch (1518:21): [True: 1.22k, False: 3.66k]

1519

                while (*p == ' ' || 

*p == '\t'1.22k

) {

Branch (1519:28): [True: 1.22k, False: 1.22k] Branch (1519:41): [True: 0, False: 1.22k]

1520

                    p++;

1521

                }

1522

            } else if (*prefix == *p) {

Branch (1522:28): [True: 3.66k, False: 0]

1523

                p++;

1524

            } else {

1525

                break;

1526

            }

1527

1528

            prefix++;

1529

        }

1530

1531

        /* This is a type comment if we matched all of type_comment_prefix. */

1532

        if (!*prefix) {

Branch (1532:17): [True: 611, False: 0]

1533

            int is_type_ignore = 1;

1534

            const char *ignore_end = p + 6;

1535

            tok_backup(tok, c);  /* don't eat the newline or EOF */

1536

1537

            type_start = p;

1538

1539

            /* A TYPE_IGNORE is "type: ignore" followed by the end of the token

1540

             * or anything ASCII and non-alphanumeric. */

1541

            is_type_ignore = (

1542

                tok->cur >= ignore_end && 

memcmp(p, "ignore", 6) == 0165

Branch (1542:21): [True: 165, False: 446] Branch (1542:47): [True: 92, False: 73]

1543

                && 

!(92

tok->cur > ignore_end92

Branch (1543:26): [True: 60, False: 32]

1544

                     && 

(60

(unsigned char)ignore_end[0] >= 12860

||

Py_ISALNUM51

(ignore_end[0]))));

Branch (1544:30): [True: 9, False: 51]

1545

1546

            if (is_type_ignore) {

Branch (1546:21): [True: 74, False: 537]

1547

                *p_start = ignore_end;

1548

                *p_end = tok->cur;

1549

1550

                /* If this type ignore is the only thing on the line, consume the newline also. */

1551

                if (blankline) {

Branch (1551:25): [True: 0, False: 74]

1552

                    tok_nextc(tok);

1553

                    tok->atbol = 1;

1554

                }

1555

                return TYPE_IGNORE;

1556

            } else {

1557

                *p_start = type_start;  /* after type_comment_prefix */

1558

                *p_end = tok->cur;

1559

                return TYPE_COMMENT;

1560

            }

1561

        }

1562

    }

1563

}

1564

1565

if (tok->done == E_INTERACT_STOP) {

Branch (1565:9): [True: 0, False: 10.0M]

1566

    return ENDMARKER;

1567

}

1568

1569

/* Check for EOF and errors now */

1570

if (c == EOF) {

Branch (1570:9): [True: 55.7k, False: 9.97M]

1571

    if (tok->level) {

Branch (1571:13): [True: 185, False: 55.5k]

1572

        return ERRORTOKEN;

1573

    }

1574

    return tok->done == E_EOF ? ENDMARKER : 

ERRORTOKEN0

;

Branch (1574:16): [True: 55.5k, False: 0]

1575

}

1576

1577

/* Identifier (most frequent token!) */

1578

nonascii = 0;

1579

if (is_potential_identifier_start(c)) {

1580

    /* Process the various legal combinations of b"", r"", u"", and f"". */

1581

    int saw_b = 0, saw_r = 0, saw_u = 0, saw_f = 0;

1582

    while (1) {

Branch (1582:16): [Folded - Ignored]

1583

        if (!(saw_b || 

saw_u2.33M

||

saw_f2.32M

) &&

(2.27M

c == 'b'2.27M

||

c == 'B'1.74M

))

Branch (1583:19): [True: 524k, False: 2.33M] Branch (1583:28): [True: 6.35k, False: 2.32M] Branch (1583:37): [True: 52.4k, False: 2.27M] Branch (1583:48): [True: 523k, False: 1.74M] Branch (1583:60): [True: 3.92k, False: 1.74M]

1584

            saw_b = 1;

1585

        /* Since this is a backwards compatibility support literal we don't

1586

           want to support it in arbitrary order like byte literals. */

1587

        else if (!(saw_b || 

saw_u1.80M

||

saw_r1.79M

||

saw_f1.74M

)

Branch (1587:24): [True: 524k, False: 1.80M] Branch (1587:33): [True: 6.35k, False: 1.79M] Branch (1587:42): [True: 53.3k, False: 1.74M] Branch (1587:51): [True: 47.2k, False: 1.69M]

1588

                 && 

(1.69M

c == 'u'1.69M

||

c == 'U'1.69M

)) {

Branch (1588:26): [True: 5.14k, False: 1.69M] Branch (1588:37): [True: 1.25k, False: 1.69M]

1589

            saw_u = 1;

1590

        }

1591

        /* ur"" and ru"" are not supported */

1592

        else if (!(saw_r || 

saw_u2.26M

) &&

(2.26M

c == 'r'2.26M

||

c == 'R'2.20M

)) {

Branch (1592:24): [True: 54.7k, False: 2.26M] Branch (1592:33): [True: 6.35k, False: 2.26M] Branch (1592:44): [True: 54.0k, False: 2.20M] Branch (1592:56): [True: 1.55k, False: 2.20M]

1593

            saw_r = 1;

1594

        }

1595

        else if (!(saw_f || 

saw_b2.21M

||

saw_u1.69M

) &&

(1.68M

c == 'f'1.68M

||

c == 'F'1.64M

)) {

Branch (1595:24): [True: 47.3k, False: 2.21M] Branch (1595:33): [True: 523k, False: 1.69M] Branch (1595:42): [True: 6.35k, False: 1.68M] Branch (1595:53): [True: 46.8k, False: 1.64M] Branch (1595:65): [True: 4.91k, False: 1.63M]

1596

            saw_f = 1;

1597

        }

1598

        else {

1599

            break;

1600

        }

1601

        c = tok_nextc(tok);

1602

        if (c == '"' || 

c == '''639k

) {

Branch (1602:17): [True: 2.49k, False: 639k] Branch (1602:29): [True: 7.24k, False: 631k]

1603

            goto letter_quote;

1604

        }

1605

    }

1606

    while (2.21M

is_potential_identifier_char(c)) {

1607

        if (c >= 128) {

Branch (1607:17): [True: 98, False: 5.69M]

1608

            nonascii = 1;

1609

        }

1610

        c = tok_nextc(tok);

1611

    }

1612

    tok_backup(tok, c);

1613

    if (nonascii && 

!verify_identifier(tok)48

) {

Branch (1613:13): [True: 48, False: 2.21M] Branch (1613:25): [True: 12, False: 36]

1614

        return ERRORTOKEN;

1615

    }

1616

1617

    *p_start = tok->start;

1618

    *p_end = tok->cur;

1619

1620

    /* async/await parsing block. */

1621

    if (tok->cur - tok->start == 5 && 

tok->start[0] == 'a'75.4k

) {

Branch (1621:13): [True: 75.4k, False: 2.13M] Branch (1621:43): [True: 2.20k, False: 73.2k]

1622

        /* May be an 'async' or 'await' token.  For Python 3.7 or

1623

           later we recognize them unconditionally.  For Python

1624

           3.5 or 3.6 we recognize 'async' in front of 'def', and

1625

           either one inside of 'async def'.  (Technically we

1626

           shouldn't recognize these at all for 3.4 or earlier,

1627

           but there's no *valid* Python 3.4 code that would be

1628

           rejected, and async functions will be rejected in a

1629

           later phase.) */

1630

        if (!tok->async_hacks || 

tok->async_def24

) {

Branch (1630:17): [True: 2.18k, False: 24] Branch (1630:38): [True: 9, False: 15]

1631

            /* Always recognize the keywords. */

1632

            if (memcmp(tok->start, "async", 5) == 0) {

Branch (1632:21): [True: 957, False: 1.23k]

1633

                return ASYNC;

1634

            }

1635

            if (memcmp(tok->start, "await", 5) == 0) {

Branch (1635:21): [True: 219, False: 1.01k]

1636

                return AWAIT;

1637

            }

1638

        }

1639

        else if (memcmp(tok->start, "async", 5) == 0) {

Branch (1639:22): [True: 12, False: 3]

1640

            /* The current token is 'async'.

1641

               Look ahead one token to see if that is 'def'. */

1642

1643

            struct tok_state ahead_tok;

1644

            const char *ahead_tok_start = NULL;

1645

            const char *ahead_tok_end = NULL;

1646

            int ahead_tok_kind;

1647

1648

            memcpy(&ahead_tok, tok, sizeof(ahead_tok));

1649

            ahead_tok_kind = tok_get(&ahead_tok, &ahead_tok_start,

1650

                                     &ahead_tok_end);

1651

1652

            if (ahead_tok_kind == NAME

Branch (1652:21): [True: 9, False: 3]

1653

                && 

ahead_tok.cur - ahead_tok.start == 39

Branch (1653:24): [True: 9, False: 0]

1654

                && 

memcmp(ahead_tok.start, "def", 3) == 09

)

Branch (1654:24): [True: 9, False: 0]

1655

            {

1656

                /* The next token is going to be 'def', so instead of

1657

                   returning a plain NAME token, return ASYNC. */

1658

                tok->async_def_indent = tok->indent;

1659

                tok->async_def = 1;

1660

                return ASYNC;

1661

            }

1662

        }

1663

    }

1664

1665

    return NAME;

1666

}

1667

1668

/* Newline */

1669

if (c == '\n') {

Branch (1669:9): [True: 3.31M, False: 4.44M]

1670

    tok->atbol = 1;

1671

    if (blankline || 

tok->level > 0651k

) {

Branch (1671:13): [True: 2.65M, False: 651k] Branch (1671:26): [True: 114k, False: 536k]

1672

        goto nextline;

1673

    }

1674

    *p_start = tok->start;

1675

    *p_end = tok->cur - 1; /* Leave '\n' out of the string */

1676

    tok->cont_line = 0;

1677

    if (tok->async_def) {

Branch (1677:13): [True: 21, False: 536k]

1678

        /* We're somewhere inside an 'async def' function, and

1679

           we've encountered a NEWLINE after its signature. */

1680

        tok->async_def_nl = 1;

1681

    }

1682

    return NEWLINE;

1683

}

1684

1685

/* Period or number starting with period? */

1686

if (c == '.') {

Branch (1686:9): [True: 423k, False: 4.01M]

1687

    c = tok_nextc(tok);

1688

    if (isdigit(c)) {

1689

        goto fraction;

1690

    } else if (c == '.') {

Branch (1690:20): [True: 1.40k, False: 422k]

1691

        c = tok_nextc(tok);

1692

        if (c == '.') {

Branch (1692:17): [True: 1.39k, False: 15]

1693

            *p_start = tok->start;

1694

            *p_end = tok->cur;

1695

            return ELLIPSIS;

1696

        }

1697

        else {

1698

            tok_backup(tok, c);

1699

        }

1700

        tok_backup(tok, '.');

1701

    }

1702

    else {

1703

        tok_backup(tok, c);

1704

    }

1705

    *p_start = tok->start;

1706

    *p_end = tok->cur;

1707

    return DOT;

1708

}

1709

1710

/* Number */

1711

if (isdigit(c)) {

1712

    if (c == '0') {

Branch (1712:13): [True: 320k, False: 325k]

1713

        /* Hex, octal or binary -- maybe. */

1714

        c = tok_nextc(tok);

1715

        if (c == 'x' || 

c == 'X'318k

) {

Branch (1715:17): [True: 2.30k, False: 318k] Branch (1715:29): [True: 2, False: 318k]

1716

            /* Hex */

1717

            c = tok_nextc(tok);

1718

            do {

1719

                if (c == '_') {

Branch (1719:25): [True: 17, False: 2.30k]

1720

                    c = tok_nextc(tok);

1721

                }

1722

                if (!isxdigit(c)) {

Branch (1722:25): [True: 16, False: 2.30k]

1723

                    tok_backup(tok, c);

1724

                    return syntaxerror(tok, "invalid hexadecimal literal");

1725

                }

1726

do 2.30k

{

1727

                    c = tok_nextc(tok);

1728

                } while (isxdigit(c));

1729

            } while (c == '_');

Branch (1729:26): [True: 13, False: 2.29k]

1730

            if (!verify_end_of_number(tok, c, "hexadecimal")) {

Branch (1730:21): [True: 12, False: 2.28k]

1731

                return ERRORTOKEN;

1732

            }

1733

        }

1734

        else if (c == 'o' || 

c == 'O'318k

) {

Branch (1734:22): [True: 142, False: 318k] Branch (1734:34): [True: 3, False: 318k]

1735

            /* Octal */

1736

            c = tok_nextc(tok);

1737

            do {

1738

                if (c == '_') {

Branch (1738:25): [True: 12, False: 143]

1739

                    c = tok_nextc(tok);

1740

                }

1741

                if (c < '0' || 

c >= '8'150

) {

Branch (1741:25): [True: 5, False: 150] Branch (1741:36): [True: 7, False: 143]

1742

                    if (isdigit(c)) {

1743

                        return syntaxerror(tok,

1744

                                "invalid digit '%c' in octal literal", c);

1745

                    }

1746

                    else {

1747

                        tok_backup(tok, c);

1748

                        return syntaxerror(tok, "invalid octal literal");

1749

                    }

1750

                }

1751

do 143

{

1752

                    c = tok_nextc(tok);

1753

                } while ('0' <= c && 

c < '8'527

);

Branch (1753:30): [True: 527, False: 95] Branch (1753:42): [True: 479, False: 48]

1754

            } while (c == '_');

Branch (1754:26): [True: 10, False: 133]

1755

            if (isdigit(c)) {

1756

                return syntaxerror(tok,

1757

                        "invalid digit '%c' in octal literal", c);

1758

            }

1759

            if (!verify_end_of_number(tok, c, "octal")) {

Branch (1759:21): [True: 11, False: 119]

1760

                return ERRORTOKEN;

1761

            }

1762

        }

1763

        else if (c == 'b' || 

c == 'B'318k

) {

Branch (1763:22): [True: 54, False: 318k] Branch (1763:34): [True: 3, False: 318k]

1764

            /* Binary */

1765

            c = tok_nextc(tok);

1766

            do {

1767

                if (c == '_') {

Branch (1767:25): [True: 10, False: 55]

1768

                    c = tok_nextc(tok);

1769

                }

1770

                if (c != '0' && 

c != '1'59

) {

Branch (1770:25): [True: 59, False: 6] Branch (1770:37): [True: 12, False: 47]

1771

                    if (isdigit(c)) {

1772

                        return syntaxerror(tok,

1773

                                "invalid digit '%c' in binary literal", c);

1774

                    }

1775

                    else {

1776

                        tok_backup(tok, c);

1777

                        return syntaxerror(tok, "invalid binary literal");

1778

                    }

1779

                }

1780

do 53

{

1781

                    c = tok_nextc(tok);

1782

                } while (c == '0' || 

c == '1'384

);

Branch (1782:30): [True: 235, False: 384] Branch (1782:42): [True: 331, False: 53]

1783

            } while (c == '_');

Branch (1783:26): [True: 8, False: 45]

1784

            if (isdigit(c)) {

1785

                return syntaxerror(tok,

1786

                        "invalid digit '%c' in binary literal", c);

1787

            }

1788

            if (!verify_end_of_number(tok, c, "binary")) {

Branch (1788:21): [True: 11, False: 32]

1789

                return ERRORTOKEN;

1790

            }

1791

        }

1792

        else {

1793

            int nonzero = 0;

1794

            /* maybe old-style octal; c is first char of it */

1795

            /* in any case, allow '0' as a literal */

1796

            while (1) {

Branch (1796:24): [Folded - Ignored]

1797

                if (c == '_') {

Branch (1797:25): [True: 13, False: 318k]

1798

                    c = tok_nextc(tok);

1799

                    if (!isdigit(c)) {

Branch (1799:29): [True: 4, False: 9]

1800

                        tok_backup(tok, c);

1801

                        return syntaxerror(tok, "invalid decimal literal");

1802

                    }

1803

                }

1804

                if (c != '0') {

Branch (1804:25): [True: 318k, False: 139]

1805

                    break;

1806

                }

1807

                c = tok_nextc(tok);

1808

            }

1809

            char* zeros_end = tok->cur;

1810

            if (isdigit(c)) {

1811

                nonzero = 1;

1812

                c = tok_decimal_tail(tok);

1813

                if (c == 0) {

Branch (1813:25): [True: 0, False: 26]

1814

                    return ERRORTOKEN;

1815

                }

1816

            }

1817

            if (c == '.') {

Branch (1817:21): [True: 445, False: 317k]

1818

                c = tok_nextc(tok);

1819

                goto fraction;

1820

            }

1821

            else if (c == 'e' || 

c == 'E'317k

) {

Branch (1821:26): [True: 11, False: 317k] Branch (1821:38): [True: 2, False: 317k]

1822

                goto exponent;

1823

            }

1824

            else if (c == 'j' || 

c == 'J'317k

) {

Branch (1824:26): [True: 128, False: 317k] Branch (1824:38): [True: 0, False: 317k]

1825

                goto imaginary;

1826

            }

1827

            else if (nonzero) {

Branch (1827:26): [True: 13, False: 317k]

1828

                /* Old-style octal: now disallowed. */

1829

                tok_backup(tok, c);

1830

                return syntaxerror_known_range(

1831

                        tok, (int)(tok->start + 1 - tok->line_start),

1832

                        (int)(zeros_end - tok->line_start),

1833

                        "leading zeros in decimal integer "

1834

                        "literals are not permitted; "

1835

                        "use an 0o prefix for octal integers");

1836

            }

1837

            if (!verify_end_of_number(tok, c, "decimal")) {

Branch (1837:21): [True: 7, False: 317k]

1838

                return ERRORTOKEN;

1839

            }

1840

        }

1841

    }

1842

    else {

1843

        /* Decimal */

1844

        c = tok_decimal_tail(tok);

1845

        if (c == 0) {

Branch (1845:17): [True: 11, False: 325k]

1846

            return ERRORTOKEN;

1847

        }

1848

        {

1849

            /* Accept floating point numbers. */

1850

            if (c == '.') {

Branch (1850:21): [True: 2.05k, False: 323k]

1851

                c = tok_nextc(tok);

1852

    fraction:

1853

                /* Fraction */

1854

                if (isdigit(c)) {

1855

                    c = tok_decimal_tail(tok);

1856

                    if (c == 0) {

Branch (1856:29): [True: 10, False: 2.42k]

1857

                        return ERRORTOKEN;

1858

                    }

1859

                }

1860

            }

1861

            if (c == 'e' || 

c == 'E'324k

) {

Branch (1861:21): [True: 1.52k, False: 324k] Branch (1861:33): [True: 1.02k, False: 323k]

1862

                int e;

1863

              exponent:

1864

                e = c;

1865

                /* Exponent part */

1866

                c = tok_nextc(tok);

1867

                if (c == '+' || 

c == '-'1.95k

) {

Branch (1867:25): [True: 604, False: 1.95k] Branch (1867:37): [True: 1.23k, False: 714]

1868

                    c = tok_nextc(tok);

1869

                    if (!isdigit(c)) {

Branch (1869:29): [True: 8, False: 1.83k]

1870

                        tok_backup(tok, c);

1871

                        return syntaxerror(tok, "invalid decimal literal");

1872

                    }

1873

                } else 

if (714

!isdigit714

(c)) {

Branch (1873:32): [True: 15, False: 699]

1874

                    tok_backup(tok, c);

1875

                    if (!verify_end_of_number(tok, e, "decimal")) {

Branch (1875:29): [True: 10, False: 5]

1876

                        return ERRORTOKEN;

1877

                    }

1878

                    tok_backup(tok, e);

1879

                    *p_start = tok->start;

1880

                    *p_end = tok->cur;

1881

                    return NUMBER;

1882

                }

1883

                c = tok_decimal_tail(tok);

1884

                if (c == 0) {

Branch (1884:25): [True: 6, False: 2.52k]

1885

                    return ERRORTOKEN;

1886

                }

1887

            }

1888

            if (c == 'j' || 

c == 'J'325k

) {

Branch (1888:21): [True: 490, False: 325k] Branch (1888:33): [True: 0, False: 325k]

1889

                /* Imaginary part */

1890

    imaginary:

1891

                c = tok_nextc(tok);

1892

                if (!verify_end_of_number(tok, c, "imaginary")) {

Branch (1892:25): [True: 10, False: 608]

1893

                    return ERRORTOKEN;

1894

                }

1895

            }

1896

            else if (!verify_end_of_number(tok, c, "decimal")) {

Branch (1896:26): [True: 27, False: 325k]

1897

                return ERRORTOKEN;

1898

            }

1899

        }

1900

    }

1901

    tok_backup(tok, c);

1902

    *p_start = tok->start;

1903

    *p_end = tok->cur;

1904

    return NUMBER;

1905

}

1906

1907

letter_quote:

1908

/* String */

1909

if (c == '\'' || 

c == '"'3.27M

) {

Branch (1909:9): [True: 107k, False: 3.27M] Branch (1909:22): [True: 98.8k, False: 3.17M]

1910

    int quote = c;

1911

    int quote_size = 1;             /* 1 or 3 */

1912

    int end_quote_size = 0;

1913

1914

    /* Nodes of type STRING, especially multi line strings

1915

       must be handled differently in order to get both

1916

       the starting line number and the column offset right.

1917

       (cf. issue 16806) */

1918

    tok->first_lineno = tok->lineno;

1919

    tok->multi_line_start = tok->line_start;

1920

1921

    /* Find the quote size and start of string */

1922

    c = tok_nextc(tok);

1923

    if (c == quote) {

Branch (1923:13): [True: 76.2k, False: 129k]

1924

        c = tok_nextc(tok);

1925

        if (c == quote) {

Branch (1925:17): [True: 8.52k, False: 67.6k]

1926

            quote_size = 3;

1927

        }

1928

        else {

1929

            end_quote_size = 1;     /* empty string found */

1930

        }

1931

    }

1932

    if (c != quote) {

Branch (1932:13): [True: 197k, False: 8.52k]

1933

        tok_backup(tok, c);

1934

    }

1935

1936

    /* Get rest of string */

1937

    while (end_quote_size != quote_size) {

Branch (1937:16): [True: 4.20M, False: 206k]

1938

        c = tok_nextc(tok);

1939

        if (c == EOF || 

(4.20M

quote_size == 14.20M

&&

c == '\n'1.72M

)) {

Branch (1939:17): [True: 11, False: 4.20M] Branch (1939:30): [True: 1.72M, False: 2.48M] Branch (1939:49): [True: 5, False: 1.72M]

1940

            assert(tok->multi_line_start != NULL);

1941

            // shift the tok_state's location into

1942

            // the start of string, and report the error

1943

            // from the initial quote character

1944

            tok->cur = (char *)tok->start;

1945

            tok->cur++;

1946

            tok->line_start = tok->multi_line_start;

1947

            int start = tok->lineno;

1948

            tok->lineno = tok->first_lineno;

1949

            if (quote_size == 3) {

Branch (1949:21): [True: 5, False: 11]

1950

                syntaxerror(tok, "unterminated triple-quoted string literal"

1951

                                 " (detected at line %d)", start);

1952

                if (c != '\n') {

Branch (1952:25): [True: 5, False: 0]

1953

                    tok->done = E_EOFS;

1954

                }

1955

                return ERRORTOKEN;

1956

            }

1957

            else {

1958

                syntaxerror(tok, "unterminated string literal (detected at"

1959

                                 " line %d)", start);

1960

                if (c != '\n') {

Branch (1960:25): [True: 6, False: 5]

1961

                    tok->done = E_EOLS;

1962

                }

1963

                return ERRORTOKEN;

1964

            }

1965

        }

1966

        if (c == quote) {

Branch (1966:17): [True: 160k, False: 4.04M]

1967

            end_quote_size += 1;

1968

        }

1969

        else {

1970

            end_quote_size = 0;

1971

            if (c == '\\') {

Branch (1971:21): [True: 29.6k, False: 4.01M]

1972

                tok_nextc(tok);  /* skip escaped char */

1973

            }

1974

        }

1975

    }

1976

1977

    *p_start = tok->start;

1978

    *p_end = tok->cur;

1979

    return STRING;

1980

}

1981

1982

/* Line continuation */

1983

if (c == '\\') {

Branch (1983:9): [True: 463, False: 3.17M]

1984

    if ((c = tok_continuation_line(tok)) == -1) {

Branch (1984:13): [True: 15, False: 448]

1985

        return ERRORTOKEN;

1986

    }

1987

    tok->cont_line = 1;

1988

    goto again; /* Read next line */

1989

}

1990

1991

/* Check for two-character token */

1992

{

1993

    int c2 = tok_nextc(tok);

1994

    int token = _PyToken_TwoChars(c, c2);

1995

    if (token != OP) {

Branch (1995:13): [True: 18.4k, False: 3.15M]

1996

        int c3 = tok_nextc(tok);

1997

        int token3 = _PyToken_ThreeChars(c, c2, c3);

1998

        if (token3 != OP) {

Branch (1998:17): [True: 89, False: 18.3k]

1999

            token = token3;

2000

        }

2001

        else {

2002

            tok_backup(tok, c3);

2003

        }

2004

        *p_start = tok->start;

2005

        *p_end = tok->cur;

2006

        return token;

2007

    }

2008

    tok_backup(tok, c2);

2009

}

2010

2011

/* Keep track of parentheses nesting level */

2012

switch (c) {

Branch (2012:13): [True: 1.35M, False: 1.80M]

2013

case '(':

Branch (2013:5): [True: 565k, False: 2.58M]

2014

case '[':

Branch (2014:5): [True: 332k, False: 2.82M]

2015

case '{':

Branch (2015:5): [True: 4.78k, False: 3.15M]

2016

    if (tok->level >= MAXLEVEL) {

Branch (2016:13): [True: 1, False: 902k]

2017

        return syntaxerror(tok, "too many nested parentheses");

2018

    }

2019

    tok->parenstack[tok->level] = c;

2020

    tok->parenlinenostack[tok->level] = tok->lineno;

2021

    tok->parencolstack[tok->level] = (int)(tok->start - tok->line_start);

2022

    tok->level++;

2023

    break;

2024

case ')':

Branch (2024:5): [True: 565k, False: 2.59M]

2025

case ']':

Branch (2025:5): [True: 332k, False: 2.82M]

2026

case '}':

Branch (2026:5): [True: 4.72k, False: 3.15M]

2027

    if (!tok->level) {

Branch (2027:13): [True: 12, False: 902k]

2028

        return syntaxerror(tok, "unmatched '%c'", c);

2029

    }

2030

    tok->level--;

2031

    int opening = tok->parenstack[tok->level];

2032

    if (!((opening == '(' && 

c == ')'565k

) ||

Branch (2032:16): [True: 565k, False: 336k] Branch (2032:34): [True: 565k, False: 2]

2033

(336k

opening == '['336k

&&

c == ']'332k

) ||

Branch (2033:16): [True: 332k, False: 4.72k] Branch (2033:34): [True: 332k, False: 2]

2034

(4.72k

opening == '{'4.72k

&&

c == '}'4.72k

)))

Branch (2034:16): [True: 4.72k, False: 4] Branch (2034:34): [True: 4.72k, False: 1]

2035

    {

2036

        if (tok->parenlinenostack[tok->level] != tok->lineno) {

Branch (2036:17): [True: 0, False: 5]

2037

            return syntaxerror(tok,

2038

                    "closing parenthesis '%c' does not match "

2039

                    "opening parenthesis '%c' on line %d",

2040

                    c, opening, tok->parenlinenostack[tok->level]);

2041

        }

2042

        else {

2043

            return syntaxerror(tok,

2044

                    "closing parenthesis '%c' does not match "

2045

                    "opening parenthesis '%c'",

2046

                    c, opening);

2047

        }

2048

    }

2049

    break;

2050

}

2051

2052

if (!Py_UNICODE_ISPRINTABLE(c)) {

Branch (2052:9): [True: 1, False: 3.15M]

2053

    char hex[9];

2054

    (void)PyOS_snprintf(hex, sizeof(hex), "%04X", c);

2055

    return syntaxerror(tok, "invalid non-printable character U+%s", hex);

2056

}

2057

2058

/* Punctuation character */

2059

*p_start = tok->start;

2060

*p_end = tok->cur;

2061

return _PyToken_OneChar(c);

2062

}

2063

2064

int

2065

_PyTokenizer_Get(struct tok_state *tok,

2066

             const char **p_start, const char **p_end)

2067

{

2068

int result = tok_get(tok, p_start, p_end);

2069

if (tok->decoding_erred) {

Branch (2069:9): [True: 0, False: 7.42M]

2070

    result = ERRORTOKEN;

2071

    tok->done = E_DECODE;

2072

}

2073

return result;

2074

}

2075

2076

#if defined(__wasi__) || (defined(__EMSCRIPTEN__) && (__EMSCRIPTEN_major__ >= 3))
/* fdopen() with borrowed fd. WASI does not provide dup() and Emscripten's
   dup() emulation with open() is slow, so on these targets the stream reads
   directly from the caller's descriptor through a custom cookie stream. */

/* Lets an int file descriptor travel through the void* cookie argument of
   fopencookie() and be recovered again inside the read callback. */
typedef union {
    void *cookie;
    int fd;
} borrowed;

/* Read callback for the cookie stream: recover the descriptor stored in the
   cookie and forward the request to read().  Returns read()'s result. */
static ssize_t
borrow_read(void *cookie, char *buf, size_t size)
{
    borrowed b = {.cookie = cookie};
    return read(b.fd, (void *)buf, size);
}

/* Open a read-only FILE* on top of fd without duplicating the descriptor.
   Returns the stream produced by fopencookie(). */
static FILE *
fdopen_borrow(int fd) {
    // supports only reading. seek fails. close and write are no-ops.
    cookie_io_functions_t io_cb = {borrow_read, NULL, NULL, NULL};
    borrowed b = {.fd = fd};
    return fopencookie(b.cookie, "r", io_cb);
}
#else
/* Open a read-only FILE* on a duplicate of fd obtained from _Py_dup().
   Returns NULL when _Py_dup() reports failure (negative result); otherwise
   returns whatever fdopen() returns for the duplicate.

   NOTE(review): if fdopen() itself fails, the duplicated descriptor is not
   closed on this path -- confirm whether callers can hit that case. */
static FILE *
fdopen_borrow(int fd) {
    fd = _Py_dup(fd);
    if (fd < 0) {
        return NULL;
    }
    return fdopen(fd, "r");
}
#endif

2108

2109

/* Get the encoding of a Python file. Check for the coding cookie and check if

2110

the file starts with a BOM.

2111

2112

_PyTokenizer_FindEncodingFilename() returns NULL when it can't find the

2113

encoding in the first or second line of the file (in which case the encoding

2114

should be assumed to be UTF-8).

2115

2116

The char* returned is malloc'ed via PyMem_Malloc() and thus must be freed

2117

by the caller. */

2118

2119

char *

2120

_PyTokenizer_FindEncodingFilename(int fd, PyObject *filename)

2121

{

2122

struct tok_state *tok;

2123

FILE *fp;

2124

const char *p_start = NULL;

2125

const char *p_end = NULL;

2126

char *encoding = NULL;

2127

2128

fp = fdopen_borrow(fd);

2129

if (fp == NULL) {

Branch (2129:9): [True: 0, False: 256]

2130

    return NULL;

2131

}

2132

tok = _PyTokenizer_FromFile(fp, NULL, NULL, NULL);

2133

if (tok == NULL) {

Branch (2133:9): [True: 0, False: 256]

2134

    fclose(fp);

2135

    return NULL;

2136

}

2137

if (filename != NULL) {

Branch (2137:9): [True: 256, False: 0]

2138

    Py_INCREF(filename);

2139

    tok->filename = filename;

2140

}

2141

else {

2142

    tok->filename = PyUnicode_FromString("<string>");

2143

    if (tok->filename == NULL) {

Branch (2143:13): [True: 0, False: 0]

2144

        fclose(fp);

2145

        _PyTokenizer_Free(tok);

2146

        return encoding;

2147

    }

2148

}

2149

while (256

tok->lineno < 2 &&

tok->done == 622

E_OK622

) {

Branch (2149:12): [True: 622, False: 254] Branch (2149:31): [True: 620, False: 2]

2150

    _PyTokenizer_Get(tok, &p_start, &p_end);

2151

}

2152

fclose(fp);

2153

if (tok->encoding) {

Branch (2153:9): [True: 0, False: 256]

2154

    encoding = (char *)PyMem_Malloc(strlen(tok->encoding) + 1);

2155

    if (encoding) {

Branch (2155:13): [True: 0, False: 0]

2156

        strcpy(encoding, tok->encoding);

2157

    }

2158

}

2159

_PyTokenizer_Free(tok);

2160

return encoding;

2161

}

2162

2163

#ifdef Py_DEBUG
/* Debug helper: write the name of token `type` (looked up in
   _PyParser_TokenNames) to stderr.  For NAME, NUMBER, STRING and OP tokens
   the token text delimited by [start, end) follows in parentheses. */
void
tok_dump(int type, char *start, char *end)
{
    fprintf(stderr, "%s", _PyParser_TokenNames[type]);

    /* Every other token type is printed by name only. */
    if (type != NAME && type != NUMBER && type != STRING && type != OP) {
        return;
    }
    fprintf(stderr, "(%.*s)", (int)(end - start), start);
}
#endif // Py_DEBUG