(original) (raw)

Line

Count

Source (jump to first uncovered line)

/* Tokenizer implementation */

#define PY_SSIZE_T_CLEAN

#include "Python.h"

#include "pycore_call.h" // _PyObject_CallNoArgs()

#include <ctype.h>

#include <assert.h>

#include "tokenizer.h"

#include "errcode.h"

#include "unicodeobject.h"

#include "bytesobject.h"

#include "fileobject.h"

#include "abstract.h"

/* Alternate tab spacing */

#define ALTTABSIZE 1

/* Nonzero if C may begin an identifier: an ASCII letter, '_', or any
   value >= 128.  The argument is parenthesized at every use so that an
   expression argument is parsed as a unit.  C is evaluated several times,
   so it must not have side effects. */
#define is_potential_identifier_start(c) (\
              ((c) >= 'a' && (c) <= 'z')\
               || ((c) >= 'A' && (c) <= 'Z')\
               || (c) == '_'\
               || ((c) >= 128))

/* Nonzero if C may appear inside an identifier: an ASCII letter or digit,
   '_', or any value >= 128.  The argument is parenthesized at every use so
   that an expression argument is parsed as a unit.  C is evaluated several
   times, so it must not have side effects. */
#define is_potential_identifier_char(c) (\
              ((c) >= 'a' && (c) <= 'z')\
               || ((c) >= 'A' && (c) <= 'Z')\
               || ((c) >= '0' && (c) <= '9')\
               || (c) == '_'\
               || ((c) >= 128))

/* Don't ever change this -- it would break the portability of Python code */

#define TABSIZE 8

/* Forward */

static struct tok_state *tok_new(void);

static int tok_nextc(struct tok_state *tok);

static void tok_backup(struct tok_state *tok, int c);

static int syntaxerror(struct tok_state *tok, const char *format, ...);

/* Spaces in this constant are treated as "zero or more spaces or tabs" when

tokenizing. */

static const char* type_comment_prefix = "# type: ";

/* Create and initialize a new tok_state structure.

   Returns a PyMem-allocated structure with the fields below set to their
   starting values, or NULL if the allocation fails. */

static struct tok_state *
tok_new(void)
{
    struct tok_state *tok = (struct tok_state *)PyMem_Malloc(
                                            sizeof(struct tok_state));
    if (tok == NULL)
        return NULL;
    tok->buf = tok->cur = tok->inp = NULL;
    tok->fp_interactive = 0;
    tok->interactive_src_start = NULL;
    tok->interactive_src_end = NULL;
    tok->start = NULL;
    tok->end = NULL;
    /* PyMem_Malloc does not zero memory.  tok_reserve_buf() reads
       multi_line_start (and line_start) when it rebases the buffer, so
       give both a defined value instead of leaving them indeterminate. */
    tok->line_start = NULL;
    tok->multi_line_start = NULL;
    tok->done = E_OK;
    tok->fp = NULL;
    tok->input = NULL;
    tok->tabsize = TABSIZE;
    tok->indent = 0;
    tok->indstack[0] = 0;
    tok->atbol = 1;
    tok->pendin = 0;
    tok->prompt = tok->nextprompt = NULL;
    tok->lineno = 0;
    tok->level = 0;
    tok->altindstack[0] = 0;
    tok->decoding_state = STATE_INIT;
    tok->decoding_erred = 0;
    tok->enc = NULL;
    tok->encoding = NULL;
    tok->cont_line = 0;
    tok->filename = NULL;
    tok->decoding_readline = NULL;
    tok->decoding_buffer = NULL;
    tok->type_comments = 0;
    tok->async_hacks = 0;
    tok->async_def = 0;
    tok->async_def_indent = 0;
    tok->async_def_nl = 0;
    tok->interactive_underflow = IUNDERFLOW_NORMAL;
    tok->str = NULL;
#ifdef Py_DEBUG
    tok->debug = _Py_GetConfig()->parser_debug;
#endif
    return tok;
}

/* Return a freshly allocated, NUL-terminated copy of the first LEN bytes
   of S.  On allocation failure set tok->done to E_NOMEM and return NULL.
   The result is allocated with PyMem_Malloc. */
static char *
new_string(const char *s, Py_ssize_t len, struct tok_state *tok)
{
    char *copy = (char *)PyMem_Malloc(len + 1);

    if (copy == NULL) {
        tok->done = E_NOMEM;
        return NULL;
    }
    memcpy(copy, s, len);
    copy[len] = '\0';
    return copy;
}

static char *

error_ret(struct tok_state tok) / XXX */

112

{

113

tok->decoding_erred = 1;

114

if (tok->fp != NULL &&

tok->buf != NULL0

) /* see _PyTokenizer_Free */

Branch (114:9): [True: 0, False: 26] Branch (114:28): [True: 0, False: 0]

115

    PyMem_Free(tok->buf);

116

tok->buf = tok->cur = tok->inp = NULL;

117

tok->start = NULL;

118

tok->end = NULL;

119

tok->done = E_DECODE;

120

return NULL;                /* as if it were EOF */

}

/* Map the common spellings of UTF-8 and Latin-1 to a canonical name.

   At most the first 12 bytes of S are examined, after folding ASCII upper
   case to lower case and replacing '_' with '-'.  Returns the literal
   "utf-8" or "iso-8859-1" when S is (a variant of) one of those names;
   otherwise returns S itself, so callers can detect "no change" by
   pointer comparison.

   Case folding is done by hand for 'A'..'Z' only: tolower() has undefined
   behaviour for negative char values and depends on the current locale,
   neither of which is wanted when comparing ASCII encoding names. */
static const char *
get_normal_name(const char *s)  /* for utf-8 and latin-1 */
{
    char buf[13];
    int i;
    for (i = 0; i < 12; i++) {
        int c = (unsigned char)s[i];
        if (c == '\0') {
            break;
        }
        else if (c == '_') {
            buf[i] = '-';
        }
        else if (c >= 'A' && c <= 'Z') {
            buf[i] = (char)(c - 'A' + 'a');
        }
        else {
            buf[i] = (char)c;
        }
    }
    buf[i] = '\0';
    if (strcmp(buf, "utf-8") == 0 ||
        strncmp(buf, "utf-8-", 6) == 0) {
        return "utf-8";
    }
    else if (strcmp(buf, "latin-1") == 0 ||
             strcmp(buf, "iso-8859-1") == 0 ||
             strcmp(buf, "iso-latin-1") == 0 ||
             strncmp(buf, "latin-1-", 8) == 0 ||
             strncmp(buf, "iso-8859-1-", 11) == 0 ||
             strncmp(buf, "iso-latin-1-", 12) == 0) {
        return "iso-8859-1";
    }
    else {
        return s;
    }
}

152

153

/* Return the coding spec in S, or NULL if none is found. */

154

155

static int

156

get_coding_spec(const char *s, char **spec, Py_ssize_t size, struct tok_state *tok)

157

{

158

Py_ssize_t i;

159

*spec = NULL;

160

/* Coding spec must be in a comment, and that comment must be

161

 * the only statement on the source code line. */

162

for (i = 0; i < size - 6;

i++0

) {

Branch (162:17): [True: 5.04k, False: 70.9k]

163

    if (s[i] == '#')

Branch (163:13): [True: 336, False: 4.70k]

164

        break;

165

    if (s[i] != ' ' && s[i] != '\t' && s[i] != '\014')

Branch (165:13): [True: 4.70k, False: 0] Branch (165:28): [True: 4.70k, False: 0] Branch (165:44): [True: 4.70k, False: 0]

166

        return 1;

167

168

for (; 71.2k

i < size - 6;

i++10.1k

) { /* XXX inefficient search */

Branch (168:12): [True: 10.2k, False: 71.1k]

169

    const char* t = s + i;

170

    if (memcmp(t, "coding", 6) == 0) {

Branch (170:13): [True: 77, False: 10.1k]

171

        const char* begin = NULL;

172

        t += 6;

173

        if (t[0] != ':' &&

t[0] != '='22

)

Branch (173:17): [True: 22, False: 55] Branch (173:32): [True: 0, False: 22]

174

            continue;

175

do 77

{

176

            t++;

177

        } while (t[0] == ' ' ||

t[0] == '\t'77

);

Branch (177:22): [True: 46, False: 77] Branch (177:37): [True: 0, False: 77]

178

179

        begin = t;

180

        while (Py_ISALNUM(t[0]) ||

181

t[0] == '-'133

t[0] == '_'77

t[0] == '.'77

)

Branch (181:20): [True: 56, False: 77] Branch (181:35): [True: 0, False: 77] Branch (181:50): [True: 0, False: 77]

182

            t++;

183

184

        if (begin < t) {

Branch (184:17): [True: 77, False: 0]

185

            char* r = new_string(begin, t - begin, tok);

186

            const char* q;

187

            if (!r)

Branch (187:21): [True: 0, False: 77]

188

                return 0;

189

            q = get_normal_name(r);

190

            if (r != q) {

Branch (190:21): [True: 30, False: 47]

191

                PyMem_Free(r);

192

                r = new_string(q, strlen(q), tok);

193

                if (!r)

Branch (193:25): [True: 0, False: 30]

194

                    return 0;

195

196

            *spec = r;

197

            break;

return 1;

202

}

203

204

/* Check whether the line contains a coding spec. If it does,

205

invoke the set_readline function for the new encoding.

206

This function receives the tok_state and the new encoding.

207

Return 1 on success, 0 on failure. */

208

209

static int

210

check_coding_spec(const char* line, Py_ssize_t size, struct tok_state *tok,

211

              int set_readline(struct tok_state *, const char *))

212

{

213

char *cs;

214

if (tok->cont_line) {

Branch (214:9): [True: 0, False: 75.9k]

215

    /* It's a continuation line, so it can't be a coding spec. */

216

    tok->decoding_state = STATE_NORMAL;

217

    return 1;

218

219

if (!get_coding_spec(line, &cs, size, tok)) {

Branch (219:9): [True: 0, False: 75.9k]

220

    return 0;

221

222

if (!cs) {

Branch (222:9): [True: 75.9k, False: 77]

223

    Py_ssize_t i;

224

    for (i = 0; i < size;

i++0

) {

Branch (224:21): [True: 75.5k, False: 344]

225

        if (line[i] == '#' ||

line[i] == '\n'75.2k

line[i] == '\r'75.1k

)

Branch (225:17): [True: 290, False: 75.2k] Branch (225:35): [True: 78, False: 75.1k] Branch (225:54): [True: 0, False: 75.1k]

226

            break;

227

        if (line[i] != ' ' && line[i] != '\t' && line[i] != '\014') {

Branch (227:17): [True: 75.1k, False: 0] Branch (227:35): [True: 75.1k, False: 0] Branch (227:54): [True: 75.1k, False: 0]

228

            /* Stop checking coding spec after a line containing

229

             * anything except a comment. */

230

            tok->decoding_state = STATE_NORMAL;

231

            break;

232

233

234

    return 1;

235

236

tok->decoding_state = STATE_NORMAL;

237

if (tok->encoding == NULL) {

Branch (237:9): [True: 51, False: 26]

238

    assert(tok->decoding_readline == NULL);

239

    if (strcmp(cs, "utf-8") != 0 &&

!set_readline(tok, cs)43

) {

Branch (239:13): [True: 43, False: 8] Branch (239:41): [True: 0, False: 43]

240

        error_ret(tok);

241

        PyErr_Format(PyExc_SyntaxError, "encoding problem: %s", cs);

242

        PyMem_Free(cs);

243

        return 0;

244

245

    tok->encoding = cs;

246

} else {                /* then, compare cs with BOM */

247

    if (strcmp(tok->encoding, cs) != 0) {

Branch (247:13): [True: 20, False: 6]

248

        error_ret(tok);

249

        PyErr_Format(PyExc_SyntaxError,

250

                     "encoding problem: %s with BOM", cs);

251

        PyMem_Free(cs);

252

        return 0;

253

254

    PyMem_Free(cs);

255

256

return 1;

257

}

258

259

/* See whether the file starts with a BOM. If it does,

260

invoke the set_readline function with the new encoding.

261

Return 1 on success, 0 on failure. */

262

263

static int

264

check_bom(int get_char(struct tok_state *),

265

      void unget_char(int, struct tok_state *),

266

      int set_readline(struct tok_state *, const char *),

267

      struct tok_state *tok)

268

{

269

int ch1, ch2, ch3;

270

ch1 = get_char(tok);

271

tok->decoding_state = STATE_SEEK_CODING;

272

if (ch1 == EOF) {

Branch (272:9): [True: 0, False: 75.6k]

273

    return 1;

274

} else if (ch1 == 0xEF) {

Branch (274:16): [True: 37, False: 75.6k]

275

    ch2 = get_char(tok);

276

    if (ch2 != 0xBB) {

Branch (276:13): [True: 1, False: 36]

277

        unget_char(ch2, tok);

278

        unget_char(ch1, tok);

279

        return 1;

280

281

    ch3 = get_char(tok);

282

    if (ch3 != 0xBF) {

Branch (282:13): [True: 2, False: 34]

283

        unget_char(ch3, tok);

284

        unget_char(ch2, tok);

285

        unget_char(ch1, tok);

286

        return 1;

287

288

} else {

289

    unget_char(ch1, tok);

290

    return 1;

291

292

if (tok->encoding != NULL)

Branch (292:9): [True: 0, False: 34]

293

    PyMem_Free(tok->encoding);

294

tok->encoding = new_string("utf-8", 5, tok);

295

if (!tok->encoding)

Branch (295:9): [True: 0, False: 34]

296

    return 0;

297

/* No need to set_readline: input is already utf-8 */

298

return 1;

}

static int

tok_concatenate_interactive_new_line(struct tok_state *tok, const char *line) {

303

assert(tok->fp_interactive);

304

305

if (!line) {

Branch (305:9): [True: 0, False: 0]

306

    return 0;

307

308

309

Py_ssize_t current_size = tok->interactive_src_end - tok->interactive_src_start;

310

Py_ssize_t line_size = strlen(line);

311

char last_char = line[line_size > 0 ? line_size - 1 : line_size];

Branch (311:27): [True: 0, False: 0]

312

if (last_char != '\n') {

Branch (312:9): [True: 0, False: 0]

313

    line_size += 1;

314

315

char* new_str = tok->interactive_src_start;

316

317

new_str = PyMem_Realloc(new_str, current_size + line_size + 1);

318

if (!new_str) {

Branch (318:9): [True: 0, False: 0]

319

    if (tok->interactive_src_start) {

Branch (319:13): [True: 0, False: 0]

320

        PyMem_Free(tok->interactive_src_start);

321

322

    tok->interactive_src_start = NULL;

323

    tok->interactive_src_end = NULL;

324

    tok->done = E_NOMEM;

325

    return -1;

326

327

strcpy(new_str + current_size, line);

328

if (last_char != '\n') {

Branch (328:9): [True: 0, False: 0]

329

    /* Last line does not end in \n, fake one */

330

    new_str[current_size + line_size - 1] = '\n';

331

    new_str[current_size + line_size] = '\0';

332

333

tok->interactive_src_start = new_str;

334

tok->interactive_src_end = new_str + current_size + line_size;

335

return 0;

}

/* Read a line of text from TOK into S, using the stream in TOK.
   Return NULL on failure, else S.

   On entry, tok->decoding_buffer will be one of:
     1) NULL: need to call tok->decoding_readline to get a new line
     2) PyUnicodeObject *: decoding_feof has called tok->decoding_readline and
           stored the result in tok->decoding_buffer
     3) PyByteArrayObject *: previous call to tok_readline_recode did not have enough room
           (in the s buffer) to copy entire contents of the line read
           by tok->decoding_readline.  tok->decoding_buffer has the overflow.
           In this case, tok_readline_recode is called in a loop (with an expanded buffer)
           until the buffer ends with a '\n' (or until the end of the file is
           reached): see tok_nextc and its calls to tok_reserve_buf.
*/

/* Make sure tok->buf can hold SIZE more bytes after tok->inp.

   The requested growth is max(SIZE, half of the bytes already stored).
   When the buffer is too small it is reallocated, and every pointer into
   it (cur, inp, end, start, line_start, multi_line_start) is rebased onto
   the new block.

   Returns 1 on success.  On allocation failure sets tok->done to E_NOMEM
   and returns 0; the old buffer is left in place. */
static int
tok_reserve_buf(struct tok_state *tok, Py_ssize_t size)
{
    Py_ssize_t cur = tok->cur - tok->buf;
    Py_ssize_t oldsize = tok->inp - tok->buf;
    Py_ssize_t newsize = oldsize + Py_MAX(size, oldsize >> 1);
    if (newsize > tok->end - tok->buf) {
        char *newbuf = tok->buf;
        /* Offsets are saved before realloc; -1 stands for "pointer was
           NULL" and is turned back into NULL afterwards. */
        Py_ssize_t start = tok->start == NULL ? -1 : tok->start - tok->buf;
        Py_ssize_t line_start = tok->start == NULL ? -1 : tok->line_start - tok->buf;
        /* Subtracting from a NULL pointer is undefined, so test for NULL
           explicitly rather than relying on the difference being negative. */
        Py_ssize_t multi_line_start = tok->multi_line_start == NULL ?
                                      -1 : tok->multi_line_start - tok->buf;
        newbuf = (char *)PyMem_Realloc(newbuf, newsize);
        if (newbuf == NULL) {
            tok->done = E_NOMEM;
            return 0;
        }
        tok->buf = newbuf;
        tok->cur = tok->buf + cur;
        tok->inp = tok->buf + oldsize;
        tok->end = tok->buf + newsize;
        tok->start = start < 0 ? NULL : tok->buf + start;
        tok->line_start = line_start < 0 ? NULL : tok->buf + line_start;
        tok->multi_line_start = multi_line_start < 0 ? NULL : tok->buf + multi_line_start;
    }
    return 1;
}

static int

tok_readline_recode(struct tok_state *tok) {

383

PyObject *line;

384

const  char *buf;

385

Py_ssize_t buflen;

386

line = tok->decoding_buffer;

387

if (line == NULL) {

Branch (387:9): [True: 0, False: 0]

388

    line = PyObject_CallNoArgs(tok->decoding_readline);

389

    if (line == NULL) {

Branch (389:13): [True: 0, False: 0]

390

        error_ret(tok);

391

        goto error;

392

393

394

else {

395

    tok->decoding_buffer = NULL;

396

397

buf = PyUnicode_AsUTF8AndSize(line, &buflen);

398

if (buf == NULL) {

Branch (398:9): [True: 0, False: 0]

399

    error_ret(tok);

400

    goto error;

401

402

if (!tok_reserve_buf(tok, buflen + 1)) {

Branch (402:9): [True: 0, False: 0]

403

    goto error;

404

405

memcpy(tok->inp, buf, buflen);

406

tok->inp += buflen;

407

*tok->inp = '\0';

408

if (tok->fp_interactive &&

Branch (408:9): [True: 0, False: 0]

409

    tok_concatenate_interactive_new_line(tok, buf) == -1) {

Branch (409:9): [True: 0, False: 0]

410

    goto error;

411

412

Py_DECREF(line);

413

return 1;

414

error:

415

Py_XDECREF(line);

416

return 0;

417

}

418

419

/* Set the readline function for TOK to a StreamReader's

420

readline function. The StreamReader is named ENC.

421

422

This function is called from check_bom and check_coding_spec.

423

424

ENC is usually identical to the future value of tok->encoding,

425

except for the (currently unsupported) case of UTF-16.

426

427

Return 1 on success, 0 on failure. */

428

429

static int

430

fp_setreadl(struct tok_state tok, const char enc)

431

{

432

PyObject *readline, *open, *stream;

433

int fd;

434

long pos;

435

436

fd = fileno(tok->fp);

437

/* Due to buffering the file offset for fd can be different from the file

438

 * position of tok->fp.  If tok->fp was opened in text mode on Windows,

439

 * its file position counts CRLF as one char and can't be directly mapped

440

 * to the file offset for fd.  Instead we step back one byte and read to

441

 * the end of line.*/

442

pos = ftell(tok->fp);

443

if (pos == -1 ||

Branch (443:9): [True: 0, False: 0]

444

    lseek(fd, (off_t)(pos > 0 ? pos - 1 : pos), SEEK_SET) == (off_t)-1) {

Branch (444:9): [True: 0, False: 0] Branch (444:27): [True: 0, False: 0]

445

    PyErr_SetFromErrnoWithFilename(PyExc_OSError, NULL);

446

    return 0;

447

448

449

open = _PyImport_GetModuleAttrString("io", "open");

450

if (open == NULL) {

Branch (450:9): [True: 0, False: 0]

451

    return 0;

452

453

stream = PyObject_CallFunction(open, "isisOOO",

454

                fd, "r", -1, enc, Py_None, Py_None, Py_False);

455

Py_DECREF(open);

456

if (stream == NULL) {

Branch (456:9): [True: 0, False: 0]

457

    return 0;

458

459

460

readline = PyObject_GetAttr(stream, &_Py_ID(readline));

461

Py_DECREF(stream);

462

if (readline == NULL) {

Branch (462:9): [True: 0, False: 0]

463

    return 0;

464

465

Py_XSETREF(tok->decoding_readline, readline);

466

467

if (pos > 0) {

Branch (467:9): [True: 0, False: 0]

468

    PyObject *bufobj = _PyObject_CallNoArgs(readline);

469

    if (bufobj == NULL) {

Branch (469:13): [True: 0, False: 0]

470

        return 0;

471

472

    Py_DECREF(bufobj);

473

474

475

return 1;

476

}

477

478

/* Fetch the next byte from TOK. */

479

480

static int fp_getc(struct tok_state *tok) {

481

return getc(tok->fp);

482

}

483

484

/* Unfetch the last byte back into TOK. */

485

486

static void fp_ungetc(int c, struct tok_state *tok) {

487

ungetc(c, tok->fp);

488

}

489

490

/* Check whether the bytes at S start a structurally valid UTF-8
   sequence.  Return the number of bytes forming the sequence if yes,
   0 if not.

   Only the lead byte's class and the 10xxxxxx form of the continuation
   bytes are checked; overlong forms and surrogates are not rejected.

   S must be NUL-terminated.  Continuation bytes are examined in forward
   order, so a sequence cut short by the terminator is rejected at the
   NUL without reading beyond it. */
static int valid_utf8(const unsigned char* s)
{
    int expected;       /* number of continuation bytes after the lead */

    if (*s < 0x80) {
        /* single-byte code */
        return 1;
    }
    if (*s < 0xc0) {
        /* following byte */
        return 0;
    }
    if (*s < 0xE0) {
        expected = 1;
    }
    else if (*s < 0xF0) {
        expected = 2;
    }
    else if (*s < 0xF8) {
        expected = 3;
    }
    else {
        return 0;
    }
    for (int i = 1; i <= expected; i++) {
        if (s[i] < 0x80 || s[i] >= 0xC0) {
            return 0;
        }
    }
    return expected + 1;
}

static int

ensure_utf8(char *line, struct tok_state *tok)

520

{

521

int badchar = 0;

522

unsigned char *c;

523

int length;

524

for (c = (unsigned char *)line; *c;

c += length18.2k

) {

Branch (524:37): [True: 18.2k, False: 806]

525

    if (!(length = valid_utf8(c))) {

Branch (525:13): [True: 0, False: 18.2k]

526

        badchar = *c;

527

        break;

528

529

530

if (badchar) {

Branch (530:9): [True: 0, False: 806]

531

    /* Need to add 1 to the line number, since this line

532

   has not been counted, yet.  */

533

    PyErr_Format(PyExc_SyntaxError,

534

                 "Non-UTF-8 code starting with '\\x%.2x' "

535

                 "in file %U on line %i, "

536

                 "but no encoding declared; "

537

                 "see https://peps.python.org/pep-0263/ for details",

538

                 badchar, tok->filename, tok->lineno + 1);

539

    return 0;

540

541

return 1;

542

}

543

544

/* Fetch a byte from TOK, using the string buffer. */

545

546

static int

547

buf_getc(struct tok_state *tok) {

548

return Py_CHARMASK(*tok->str++);

549

}

550

551

/* Unfetch a byte from TOK, using the string buffer. */

552

553

static void

554

buf_ungetc(int c, struct tok_state *tok) {

555

tok->str--;

556

assert(Py_CHARMASK(*tok->str) == c);        /* tok->cur may point to read-only segment */

557

}

558

559

/* Set the readline function for TOK to ENC. For the string-based

560

tokenizer, this means to just record the encoding. */

561

562

static int

563

buf_setreadl(struct tok_state tok, const char enc) {

564

tok->enc = enc;

565

return 1;

566

}

567

568

/* Return a UTF-8 encoding Python string object from the

569

C byte string STR, which is encoded with ENC. */

570

571

static PyObject *

572

translate_into_utf8(const char* str, const char* enc) {

573

PyObject *utf8;

574

PyObject* buf = PyUnicode_Decode(str, strlen(str), enc, NULL);

575

if (buf == NULL)

Branch (575:9): [True: 6, False: 37]

576

    return NULL;

577

utf8 = PyUnicode_AsUTF8String(buf);

578

Py_DECREF(buf);

579

return utf8;

}

static char *

translate_newlines(const char *s, int exec_input, struct tok_state *tok) {

585

int skip_next_lf = 0;

586

size_t needed_length = strlen(s) + 2, final_length;

587

char *buf, *current;

588

char c = '\0';

589

buf = PyMem_Malloc(needed_length);

590

if (buf == NULL) {

Branch (590:9): [True: 0, False: 132k]

591

    tok->done = E_NOMEM;

592

    return NULL;

593

594

for (current = buf; 132k

*s;

s++, current++27.2M

) {

Branch (594:25): [True: 27.2M, False: 132k]

595

    c = *s;

596

    if (skip_next_lf) {

Branch (596:13): [True: 40, False: 27.2M]

597

        skip_next_lf = 0;

598

        if (c == '\n') {

Branch (598:17): [True: 23, False: 17]

599

            c = *++s;

600

            if (!c)

Branch (600:21): [True: 7, False: 16]

601

                break;

602

603

604

    if (c == '\r') {

Branch (604:13): [True: 47, False: 27.2M]

605

        skip_next_lf = 1;

606

        c = '\n';

607

608

    *current = c;

609

610

/* If this is exec input, add a newline to the end of the string if

611

   there isn't one already. */

612

if (exec_input &&

c != '\n'90.7k

) {

Branch (612:9): [True: 90.7k, False: 41.6k] Branch (612:23): [True: 86.4k, False: 4.29k]

613

    *current = '\n';

614

    current++;

615

616

*current = '\0';

617

final_length = current - buf + 1;

618

if (final_length < needed_length &&

final_length45.9k

) {

Branch (618:9): [True: 45.9k, False: 86.4k] Branch (618:41): [True: 45.9k, False: 0]

619

    /* should never fail */

620

    char* result = PyMem_Realloc(buf, final_length);

621

    if (result == NULL) {

Branch (621:13): [True: 0, False: 45.9k]

622

        PyMem_Free(buf);

623

624

    buf = result;

625

626

return buf;

627

}

628

629

/* Decode a byte string STR for use as the buffer of TOK.

630

Look for encoding declarations inside STR, and record them

inside TOK. */

static char *

decode_str(const char *input, int single, struct tok_state *tok)

635

{

636

PyObject* utf8 = NULL;

637

char *str;

638

const char *s;

639

const char *newl[2] = {NULL, NULL};

640

int lineno = 0;

641

tok->input = str = translate_newlines(input, single, tok);

642

if (str == NULL)

Branch (642:9): [True: 0, False: 75.4k]

643

    return NULL;

644

tok->enc = NULL;

645

tok->str = str;

646

if (!check_bom(buf_getc, buf_ungetc, buf_setreadl, tok))

Branch (646:9): [True: 0, False: 75.4k]

647

    return error_ret(tok);

648

str = tok->str;             /* string after BOM if any */

649

assert(str);

650

if (tok->enc != NULL) {

Branch (650:9): [True: 0, False: 75.4k]

651

    utf8 = translate_into_utf8(str, tok->enc);

652

    if (utf8 == NULL)

Branch (652:13): [True: 0, False: 0]

653

        return error_ret(tok);

654

    str = PyBytes_AsString(utf8);

655

656

for (s = str;; 75.4k

s++846k

) {

657

    if (*s == '\0')

break74.2k

;

Branch (657:13): [True: 74.2k, False: 847k]

658

    else if (*s == '\n') {

Branch (658:18): [True: 76.5k, False: 770k]

659

        assert(lineno < 2);

660

        newl[lineno] = s;

661

        lineno++;

662

        if (lineno == 2)

break1.18k

;

Branch (662:17): [True: 1.18k, False: 75.4k]

663

664

665

tok->enc = NULL;

666

/* need to check line 1 and 2 separately since check_coding_spec

667

   assumes a single line as input */

668

if (newl[0]) {

Branch (668:9): [True: 75.4k, False: 13]

669

    if (!check_coding_spec(str, newl[0] - str, tok, buf_setreadl)) {

Branch (669:13): [True: 20, False: 75.3k]

670

        return NULL;

671

672

    if (tok->enc == NULL &&

tok->decoding_state != STATE_NORMAL75.3k

newl[1]481

) {

Branch (672:13): [True: 75.3k, False: 36] Branch (672:33): [True: 481, False: 74.8k] Branch (672:72): [True: 224, False: 257]

673

        if (!check_coding_spec(newl[0]+1, newl[1] - newl[0],

Branch (673:17): [True: 0, False: 224]

674

                               tok, buf_setreadl))

675

            return NULL;

676

677

678

if (tok->enc != NULL) {

Branch (678:9): [True: 43, False: 75.3k]

679

    assert(utf8 == NULL);

680

    utf8 = translate_into_utf8(str, tok->enc);

681

    if (utf8 == NULL)

Branch (681:13): [True: 6, False: 37]

682

        return error_ret(tok);

683

    str = PyBytes_AS_STRING(utf8);

684

685

assert(tok->decoding_buffer == NULL);

686

tok->decoding_buffer = utf8; /* CAUTION */

687

return str;

688

}

689

690

/* Set up tokenizer for string */

691

692

struct tok_state *

693

_PyTokenizer_FromString(const char *str, int exec_input)

694

{

695

struct tok_state *tok = tok_new();

696

char *decoded;

697

698

if (tok == NULL)

Branch (698:9): [True: 0, False: 75.4k]

699

    return NULL;

700

decoded = decode_str(str, exec_input, tok);

701

if (decoded == NULL) {

Branch (701:9): [True: 26, False: 75.4k]

702

    _PyTokenizer_Free(tok);

703

    return NULL;

704

705

706

tok->buf = tok->cur = tok->inp = decoded;

707

tok->end = decoded;

708

return tok;

709

}

710

711

/* Set up tokenizer for UTF-8 string */

712

713

struct tok_state *

714

_PyTokenizer_FromUTF8(const char *str, int exec_input)

715

{

716

struct tok_state *tok = tok_new();

717

char *translated;

718

if (tok == NULL)

Branch (718:9): [True: 0, False: 56.9k]

719

    return NULL;

720

tok->input = translated = translate_newlines(str, exec_input, tok);

721

if (translated == NULL) {

Branch (721:9): [True: 0, False: 56.9k]

722

    _PyTokenizer_Free(tok);

723

    return NULL;

724

725

tok->decoding_state = STATE_NORMAL;

726

tok->enc = NULL;

727

tok->str = translated;

728

tok->encoding = new_string("utf-8", 5, tok);

729

if (!tok->encoding) {

Branch (729:9): [True: 0, False: 56.9k]

730

    _PyTokenizer_Free(tok);

731

    return NULL;

732

733

734

tok->buf = tok->cur = tok->inp = translated;

735

tok->end = translated;

736

return tok;

737

}

738

739

/* Set up tokenizer for file */

740

741

struct tok_state *

742

_PyTokenizer_FromFile(FILE fp, const char enc,

743

                  const char *ps1, const char *ps2)

744

{

745

struct tok_state *tok = tok_new();

746

if (tok == NULL)

Branch (746:9): [True: 0, False: 257]

747

    return NULL;

748

if ((tok->buf = (char *)PyMem_Malloc(BUFSIZ)) == NULL) {

Branch (748:9): [True: 0, False: 257]

749

    _PyTokenizer_Free(tok);

750

    return NULL;

751

752

tok->cur = tok->inp = tok->buf;

753

tok->end = tok->buf + BUFSIZ;

754

tok->fp = fp;

755

tok->prompt = ps1;

756

tok->nextprompt = ps2;

757

if (enc != NULL) {

Branch (757:9): [True: 0, False: 257]

758

    /* Must copy encoding declaration since it

759

       gets copied into the parse tree. */

760

    tok->encoding = new_string(enc, strlen(enc), tok);

761

    if (!tok->encoding) {

Branch (761:13): [True: 0, False: 0]

762

        _PyTokenizer_Free(tok);

763

        return NULL;

764

765

    tok->decoding_state = STATE_NORMAL;

766

767

return tok;

768

}

769

770

/* Free a tok_state structure */

771

772

void

773

_PyTokenizer_Free(struct tok_state *tok)

774

{

775

if (tok->encoding != NULL) {

Branch (775:9): [True: 57.0k, False: 75.5k]

776

    PyMem_Free(tok->encoding);

777

778

Py_XDECREF(tok->decoding_readline);

779

Py_XDECREF(tok->decoding_buffer);

780

Py_XDECREF(tok->filename);

781

if (tok->fp != NULL &&

tok->buf != NULL257

) {

Branch (781:9): [True: 257, False: 132k] Branch (781:28): [True: 257, False: 0]

782

    PyMem_Free(tok->buf);

783

784

if (tok->input) {

Branch (784:9): [True: 132k, False: 257]

785

    PyMem_Free(tok->input);

786

787

if (tok->interactive_src_start != NULL) {

Branch (787:9): [True: 0, False: 132k]

788

    PyMem_Free(tok->interactive_src_start);

789

790

PyMem_Free(tok);

}

static int

tok_readline_raw(struct tok_state *tok)

795

{

796

do {

797

    if (!tok_reserve_buf(tok, BUFSIZ)) {

Branch (797:13): [True: 0, False: 810]

798

        return 0;

799

800

    char *line = Py_UniversalNewlineFgets(tok->inp,

801

                                          (int)(tok->end - tok->inp),

802

                                          tok->fp, NULL);

803

    if (line == NULL) {

Branch (803:13): [True: 4, False: 806]

804

        return 1;

805

806

    if (tok->fp_interactive &&

Branch (806:13): [True: 0, False: 806]

807

tok_concatenate_interactive_new_line(tok, line) == -10

) {

Branch (807:13): [True: 0, False: 0]

808

        return 0;

809

810

    tok->inp = strchr(tok->inp, '\0');

811

    if (tok->inp == tok->buf) {

Branch (811:13): [True: 0, False: 806]

812

        return 0;

813

814

} while (tok->inp[-1] != '\n');

Branch (814:14): [True: 1, False: 805]

815

return 1;

}

static int

tok_underflow_string(struct tok_state *tok) {

820

char *end = strchr(tok->inp, '\n');

821

if (end != NULL) {

Branch (821:9): [True: 3.37M, False: 92.9k]

822

    end++;

823

824

else {

825

    end = strchr(tok->inp, '\0');

826

    if (end == tok->inp) {

Branch (826:13): [True: 55.7k, False: 37.1k]

827

        tok->done = E_EOF;

828

        return 0;

829

830

831

if (tok->start == NULL) {

Branch (831:9): [True: 3.34M, False: 64.1k]

832

    tok->buf = tok->cur;

833

834

tok->line_start = tok->cur;

835

tok->lineno++;

836

tok->inp = end;

837

return 1;

}

static int

tok_underflow_interactive(struct tok_state *tok) {

842

if (tok->interactive_underflow == IUNDERFLOW_STOP) {

Branch (842:9): [True: 0, False: 0]

843

    tok->done = E_INTERACT_STOP;

844

    return 1;

845

846

char *newtok = PyOS_Readline(tok->fp ? tok->fp : stdin, stdout, tok->prompt);

Branch (846:34): [True: 0, False: 0]

847

if (newtok != NULL) {

Branch (847:9): [True: 0, False: 0]

848

    char *translated = translate_newlines(newtok, 0, tok);

849

    PyMem_Free(newtok);

850

    if (translated == NULL) {

Branch (850:13): [True: 0, False: 0]

851

        return 0;

852

853

    newtok = translated;

854

855

if (tok->encoding && newtok && *newtok) {

Branch (855:9): [True: 0, False: 0] Branch (855:26): [True: 0, False: 0] Branch (855:36): [True: 0, False: 0]

856

    /* Recode to UTF-8 */

857

    Py_ssize_t buflen;

858

    const char* buf;

859

    PyObject *u = translate_into_utf8(newtok, tok->encoding);

860

    PyMem_Free(newtok);

861

    if (u == NULL) {

Branch (861:13): [True: 0, False: 0]

862

        tok->done = E_DECODE;

863

        return 0;

864

865

    buflen = PyBytes_GET_SIZE(u);

866

    buf = PyBytes_AS_STRING(u);

867

    newtok = PyMem_Malloc(buflen+1);

868

    if (newtok == NULL) {

Branch (868:13): [True: 0, False: 0]

869

        Py_DECREF(u);

870

        tok->done = E_NOMEM;

871

        return 0;

872

873

    strcpy(newtok, buf);

874

    Py_DECREF(u);

875

876

if (tok->fp_interactive &&

Branch (876:9): [True: 0, False: 0]

877

    tok_concatenate_interactive_new_line(tok, newtok) == -1) {

Branch (877:9): [True: 0, False: 0]

878

    PyMem_Free(newtok);

879

    return 0;

880

881

if (tok->nextprompt != NULL) {

Branch (881:9): [True: 0, False: 0]

882

    tok->prompt = tok->nextprompt;

883

884

if (newtok == NULL) {

Branch (884:9): [True: 0, False: 0]

885

    tok->done = E_INTR;

886

887

else if (*newtok == '\0') {

Branch (887:14): [True: 0, False: 0]

888

    PyMem_Free(newtok);

889

    tok->done = E_EOF;

890

891

else if (tok->start != NULL) {

Branch (891:14): [True: 0, False: 0]

892

    Py_ssize_t cur_multi_line_start = tok->multi_line_start - tok->buf;

893

    size_t size = strlen(newtok);

894

    tok->lineno++;

895

    if (!tok_reserve_buf(tok, size + 1)) {

Branch (895:13): [True: 0, False: 0]

896

        PyMem_Free(tok->buf);

897

        tok->buf = NULL;

898

        PyMem_Free(newtok);

899

        return 0;

900

901

    memcpy(tok->cur, newtok, size + 1);

902

    PyMem_Free(newtok);

903

    tok->inp += size;

904

    tok->multi_line_start = tok->buf + cur_multi_line_start;

905

906

else {

907

    tok->lineno++;

908

    PyMem_Free(tok->buf);

909

    tok->buf = newtok;

910

    tok->cur = tok->buf;

911

    tok->line_start = tok->buf;

912

    tok->inp = strchr(tok->buf, '\0');

913

    tok->end = tok->inp + 1;

914

915

if (tok->done != E_OK) {

Branch (915:9): [True: 0, False: 0]

916

    if (tok->prompt != NULL) {

Branch (916:13): [True: 0, False: 0]

917

        PySys_WriteStderr("\n");

918

919

    return 0;

920

921

return 1;

}

static int

tok_underflow_file(struct tok_state *tok) {

926

if (tok->start == NULL) {

Branch (926:9): [True: 801, False: 8]

927

    tok->cur = tok->inp = tok->buf;

928

929

if (tok->decoding_state == STATE_INIT) {

Branch (929:9): [True: 257, False: 552]

930

    /* We have not yet determined the encoding.

931

       If an encoding is found, use the file-pointer

932

       reader functions from now on. */

933

    if (!check_bom(fp_getc, fp_ungetc, fp_setreadl, tok)) {

Branch (933:13): [True: 0, False: 257]

934

        error_ret(tok);

935

        return 0;

936

937

    assert(tok->decoding_state != STATE_INIT);

938

939

/* Read until '\n' or EOF */

940

if (tok->decoding_readline != NULL) {

Branch (940:9): [True: 0, False: 809]

941

    /* We already have a codec associated with this input. */

942

    if (!tok_readline_recode(tok)) {

Branch (942:13): [True: 0, False: 0]

943

        return 0;

944

945

946

else {

947

    /* We want a 'raw' read. */

948

    if (!tok_readline_raw(tok)) {

Branch (948:13): [True: 0, False: 809]

949

        return 0;

950

951

952

if (tok->inp == tok->cur) {

Branch (952:9): [True: 3, False: 806]

953

    tok->done = E_EOF;

954

    return 0;

955

956

if (tok->inp[-1] != '\n') {

Branch (956:9): [True: 1, False: 805]

957

    /* Last line does not end in \n, fake one */

958

    *tok->inp++ = '\n';

959

    *tok->inp = '\0';

960

961

962

tok->lineno++;

963

if (tok->decoding_state != STATE_NORMAL) {

Branch (963:9): [True: 425, False: 381]

964

    if (tok->lineno > 2) {

Branch (964:13): [True: 84, False: 341]

965

        tok->decoding_state = STATE_NORMAL;

966

967

    else if (!check_coding_spec(tok->cur, strlen(tok->cur),

Branch (967:18): [True: 0, False: 341]

968

                                tok, fp_setreadl))

969

970

        return 0;

971

972

973

/* The default encoding is UTF-8, so make sure we don't have any

974

   non-UTF-8 sequences in it. */

975

if (!tok->encoding && !ensure_utf8(tok->cur, tok)) {

Branch (975:9): [True: 806, False: 0] Branch (975:27): [True: 0, False: 806]

976

    error_ret(tok);

977

    return 0;

978

979

assert(tok->done == E_OK);

980

return tok->done == E_OK;

981

}

982

983

#if defined(Py_DEBUG)
/* Debug helper: write the first `size` bytes of `s` to `f` as a
   double-quoted string with C-style escapes, or the word NULL when `s` is
   a null pointer.

   Newline, carriage return, tab, form feed, both quote characters and the
   backslash get symbolic escapes; other bytes outside the printable ASCII
   range 0x20..0x7e (including DEL, 0x7f) are written as \xNN.  Escaping the
   backslash keeps the output unambiguous: a literal backslash followed by
   'n' no longer looks like an escaped newline. */
static void
print_escape(FILE *f, const char *s, Py_ssize_t size)
{
    if (s == NULL) {
        fputs("NULL", f);
        return;
    }
    putc('"', f);
    while (size-- > 0) {
        unsigned char c = *s++;
        switch (c) {
            case '\n': fputs("\\n", f); break;
            case '\r': fputs("\\r", f); break;
            case '\t': fputs("\\t", f); break;
            case '\f': fputs("\\f", f); break;
            case '\'': fputs("\\'", f); break;
            case '"': fputs("\\\"", f); break;
            case '\\': fputs("\\\\", f); break;
            default:
                /* 0x7f (DEL) is a control character, so it is excluded
                   from the pass-through range. */
                if (0x20 <= c && c < 0x7f) {
                    putc(c, f);
                }
                else {
                    fprintf(f, "\\x%02x", (unsigned int)c);
                }
                break;
        }
    }
    putc('"', f);
}
#endif

/* Get next char, updating state; error code goes into tok->done */

1013

1014

static int

1015

tok_nextc(struct tok_state *tok)

1016

{

1017

int rc;

1018

for (;;) {

1019

    if (tok->cur != tok->inp) {

Branch (1019:13): [True: 47.0M, False: 3.57M]

1020

        return Py_CHARMASK(*tok->cur++); /* Fast path */

1021

1022

    if (tok->done != E_OK) {

Branch (1022:13): [True: 107k, False: 3.46M]

1023

       return EOF;

1024

1025

    if (tok->fp == NULL) {

Branch (1025:13): [True: 3.46M, False: 809]

1026

        rc = tok_underflow_string(tok);

1027

1028

    else if (tok->prompt != NULL) {

Branch (1028:18): [True: 0, False: 809]

1029

        rc = tok_underflow_interactive(tok);

1030

1031

    else {

1032

        rc = tok_underflow_file(tok);

1033

1034

#if defined(Py_DEBUG)

1035

    if (tok->debug) {

1036

        fprintf(stderr, "line[%d] = ", tok->lineno);

1037

        print_escape(stderr, tok->cur, tok->inp - tok->cur);

1038

        fprintf(stderr, "  tok->done = %d\n", tok->done);

1039

1040

#endif

1041

    if (!rc) {

Branch (1041:13): [True: 55.7k, False: 3.41M]

1042

        tok->cur = tok->inp;

1043

        return EOF;

1044

1045

    tok->line_start = tok->cur;

1046

1047

Py_UNREACHABLE0

();

1048

}

1049

1050

/* Back-up one character */

1051

1052

static void

1053

tok_backup(struct tok_state *tok, int c)

1054

{

1055

if (c != EOF) {

Branch (1055:9): [True: 19.9M, False: 107k]

1056

    if (--tok->cur < tok->buf) {

Branch (1056:13): [True: 0, False: 19.9M]

1057

        Py_FatalError("tokenizer beginning of buffer");

1058

1059

    if ((int)(unsigned char)*tok->cur != c) {

Branch (1059:13): [True: 0, False: 19.9M]

1060

        Py_FatalError("tok_backup: wrong character");

}

static int

_syntaxerror_range(struct tok_state *tok, const char *format,

1067

               int col_offset, int end_col_offset,

1068

               va_list vargs)

1069

{

1070

PyObject *errmsg, *errtext, *args;

1071

errmsg = PyUnicode_FromFormatV(format, vargs);

1072

if (!errmsg) {

Branch (1072:9): [True: 0, False: 228]

1073

    goto error;

1074

1075

1076

errtext = PyUnicode_DecodeUTF8(tok->line_start, tok->cur - tok->line_start,

1077

                               "replace");

1078

if (!errtext) {

Branch (1078:9): [True: 0, False: 228]

1079

    goto error;

1080

1081

1082

if (col_offset == -1) {

Branch (1082:9): [True: 215, False: 13]

1083

    col_offset = (int)PyUnicode_GET_LENGTH(errtext);

1084

1085

if (end_col_offset == -1) {

Branch (1085:9): [True: 215, False: 13]

1086

    end_col_offset = col_offset;

1087

1088

1089

Py_ssize_t line_len = strcspn(tok->line_start, "\n");

1090

if (line_len != tok->cur - tok->line_start) {

Branch (1090:9): [True: 153, False: 75]

1091

    Py_DECREF(errtext);

1092

    errtext = PyUnicode_DecodeUTF8(tok->line_start, line_len,

1093

                                   "replace");

1094

1095

if (!errtext) {

Branch (1095:9): [True: 0, False: 228]

1096

    goto error;

1097

1098

1099

args = Py_BuildValue("(O(OiiNii))", errmsg, tok->filename, tok->lineno,

1100

                     col_offset, errtext, tok->lineno, end_col_offset);

1101

if (args) {

Branch (1101:9): [True: 228, False: 0]

1102

    PyErr_SetObject(PyExc_SyntaxError, args);

1103

    Py_DECREF(args);

error:

Py_XDECREF(errmsg);

1108

tok->done = E_ERROR;

1109

return ERRORTOKEN;

}

/* Set a SyntaxError at the current position from a printf-style message.
   Forwards to _syntaxerror_range() with both column offsets set to -1 and
   returns its result. */
static int
syntaxerror(struct tok_state *tok, const char *format, ...)
{
    va_list ap;
    int token;

    va_start(ap, format);
    token = _syntaxerror_range(tok, format, -1, -1, ap);
    va_end(ap);
    return token;
}

/* Set a SyntaxError from a printf-style message with an explicit column
   range.  Forwards to _syntaxerror_range() and returns its result. */
static int
syntaxerror_known_range(struct tok_state *tok,
                        int col_offset, int end_col_offset,
                        const char *format, ...)
{
    va_list ap;
    int token;

    va_start(ap, format);
    token = _syntaxerror_range(tok, format, col_offset, end_col_offset, ap);
    va_end(ap);
    return token;
}

static int

indenterror(struct tok_state *tok)

1138

{

1139

tok->done = E_TABSPACE;

1140

tok->cur = tok->inp;

1141

return ERRORTOKEN;

}

/* Issue a warning of the given category for the current file and line,
   with a printf-style message.

   Returns 0 when the warning was issued.  Returns -1 and sets
   tok->done = E_ERROR when the message cannot be built or when issuing the
   warning fails; if the pending exception is the warning category itself,
   it is replaced with a SyntaxError carrying the same message. */
static int
parser_warn(struct tok_state *tok, PyObject *category, const char *format, ...)
{
    va_list ap;
    va_start(ap, format);
    PyObject *message = PyUnicode_FromFormatV(format, ap);
    va_end(ap);

    int status = -1;
    if (message != NULL) {
        if (PyErr_WarnExplicitObject(category, message, tok->filename,
                                     tok->lineno, NULL, NULL) >= 0) {
            status = 0;
        }
        else if (PyErr_ExceptionMatches(category)) {
            /* Replace the warning exception with a SyntaxError
               to get a more accurate error report */
            PyErr_Clear();
            syntaxerror(tok, "%U", message);
        }
    }

    Py_XDECREF(message);
    if (status < 0) {
        tok->done = E_ERROR;
    }
    return status;
}

/* Check whether the upcoming input is exactly `test` followed by a
   character that cannot continue an identifier (EOF counts as such).

   Every character read is pushed back before returning, so the tokenizer
   position is unchanged.  Returns 1 on a match, 0 otherwise. */
static int
lookahead(struct tok_state *tok, const char *test)
{
    const char *p = test;
    int c = tok_nextc(tok);

    /* Consume input for as long as it agrees with `test`. */
    while (*p != 0 && c == *p) {
        p++;
        c = tok_nextc(tok);
    }

    int matched = 0;
    if (*p == 0) {
        matched = !is_potential_identifier_char(c);
    }

    /* Undo: first the character that stopped the scan, then the matched
       prefix in reverse order. */
    tok_backup(tok, c);
    while (p != test) {
        tok_backup(tok, *--p);
    }
    return matched;
}

static int

verify_end_of_number(struct tok_state *tok, int c, const char *kind)

1200

{

1201

/* Emit a deprecation warning only if the numeric literal is immediately

1202

 * followed by one of keywords which can occur after a numeric literal

1203

 * in valid code: "and", "else", "for", "if", "in", "is" and "or".

1204

 * It allows to gradually deprecate existing valid code without adding

1205

 * warning before error in most cases of invalid numeric literal (which

1206

 * would be confusing and break existing tests).

1207

 * Raise a syntax error with slightly better message than plain

1208

 * "invalid syntax" if the numeric literal is immediately followed by

1209

 * other keyword or identifier.

1210

*/

1211

int r = 0;

1212

if (c == 'a') {

Branch (1212:9): [True: 14, False: 646k]

1213

    r = lookahead(tok, "nd");

1214

1215

else if (c == 'e') {

Branch (1215:14): [True: 24, False: 646k]

1216

    r = lookahead(tok, "lse");

1217

1218

else if (c == 'f') {

Branch (1218:14): [True: 14, False: 646k]

1219

    r = lookahead(tok, "or");

1220

1221

else if (c == 'i') {

Branch (1221:14): [True: 48, False: 646k]

1222

    int c2 = tok_nextc(tok);

1223

    if (c2 == 'f' ||

c2 == 'n'32

c2 == 's'16

) {

Branch (1223:13): [True: 16, False: 32] Branch (1223:26): [True: 16, False: 16] Branch (1223:39): [True: 16, False: 0]

1224

        r = 1;

1225

1226

    tok_backup(tok, c2);

1227

1228

else if (c == 'o') {

Branch (1228:14): [True: 22, False: 646k]

1229

    r = lookahead(tok, "r");

1230

1231

else if (c == 'n') {

Branch (1231:14): [True: 17, False: 646k]

1232

    r = lookahead(tok, "ot");

1233

1234

if (r) {

Branch (1234:9): [True: 130, False: 646k]

1235

    tok_backup(tok, c);

1236

    if (parser_warn(tok, PyExc_SyntaxWarning,

Branch (1236:13): [True: 64, False: 66]

1237

            "invalid %s literal", kind))

1238

1239

        return 0;

1240

1241

    tok_nextc(tok);

1242

1243

else /* In future releases, only error will remain. */

1244

if (is_potential_identifier_char(c)) {

1245

    tok_backup(tok, c);

1246

    syntaxerror(tok, "invalid %s literal", kind);

1247

    return 0;

1248

1249

return 1;

1250

}

1251

1252

/* Verify that the identifier follows PEP 3131.

1253

All identifier strings are guaranteed to be "ready" unicode objects.

1254

1255

static int

1256

verify_identifier(struct tok_state *tok)

1257

{

1258

PyObject *s;

1259

if (tok->decoding_erred)

Branch (1259:9): [True: 0, False: 48]

1260

    return 0;

1261

s = PyUnicode_DecodeUTF8(tok->start, tok->cur - tok->start, NULL);

1262

if (s == NULL) {

Branch (1262:9): [True: 4, False: 44]

1263

    if (PyErr_ExceptionMatches(PyExc_UnicodeDecodeError)) {

Branch (1263:13): [True: 4, False: 0]

1264

        tok->done = E_DECODE;

1265

1266

    else {

1267

        tok->done = E_ERROR;

1268

1269

    return 0;

1270

1271

Py_ssize_t invalid = _PyUnicode_ScanIdentifier(s);

1272

if (invalid < 0) {

Branch (1272:9): [True: 0, False: 44]

1273

    Py_DECREF(s);

1274

    tok->done = E_ERROR;

1275

    return 0;

1276

1277

assert(PyUnicode_GET_LENGTH(s) > 0);

1278

if (invalid < PyUnicode_GET_LENGTH(s)) {

Branch (1278:9): [True: 8, False: 36]

1279

    Py_UCS4 ch = PyUnicode_READ_CHAR(s, invalid);

1280

    if (invalid + 1 < PyUnicode_GET_LENGTH(s)) {

Branch (1280:13): [True: 0, False: 8]

1281

        /* Determine the offset in UTF-8 encoded input */

1282

        Py_SETREF(s, PyUnicode_Substring(s, 0, invalid + 1));

1283

        if (s != NULL) {

Branch (1283:17): [True: 0, False: 0]

1284

            Py_SETREF(s, PyUnicode_AsUTF8String(s));

1285

1286

        if (s == NULL) {

Branch (1286:17): [True: 0, False: 0]

1287

            tok->done = E_ERROR;

1288

            return 0;

1289

1290

        tok->cur = (char *)tok->start + PyBytes_GET_SIZE(s);

1291

1292

    Py_DECREF(s);

1293

    // PyUnicode_FromFormatV() does not support %X

1294

    char hex[9];

1295

    (void)PyOS_snprintf(hex, sizeof(hex), "%04X", ch);

1296

    if (Py_UNICODE_ISPRINTABLE(ch)) {

1297

        syntaxerror(tok, "invalid character '%c' (U+%s)", ch, hex);

1298

1299

    else {

1300

        syntaxerror(tok, "invalid non-printable character U+%s", hex);

1301

1302

    return 0;

1303

1304

Py_DECREF(s);

1305

return 1;

}

/* Consume the rest of a run of decimal digits in which single underscores
   may separate groups of digits.

   Returns the first character after the run (already consumed; the caller
   is expected to back it up if needed).  Returns 0 after setting a
   SyntaxError when an underscore is not followed by a digit. */
static int
tok_decimal_tail(struct tok_state *tok)
{
    for (;;) {
        int c = tok_nextc(tok);
        while (isdigit(c)) {
            c = tok_nextc(tok);
        }
        if (c != '_') {
            return c;
        }

        /* An underscore must be followed by at least one digit. */
        c = tok_nextc(tok);
        if (!isdigit(c)) {
            tok_backup(tok, c);
            syntaxerror(tok, "invalid decimal literal");
            return 0;
        }
    }
}

1329

1330

/* Get next token, after space stripping etc. */

1331

1332

static inline int

1333

tok_continuation_line(struct tok_state *tok) {

1334

int c = tok_nextc(tok);

1335

if (c != '\n') {

Branch (1335:9): [True: 7, False: 486]

1336

    tok->done = E_LINECONT;

1337

    return -1;

1338

1339

c = tok_nextc(tok);

1340

if (c == EOF) {

Branch (1340:9): [True: 9, False: 477]

1341

    tok->done = E_EOF;

1342

    tok->cur = tok->inp;

1343

    return -1;

1344

} else {

1345

    tok_backup(tok, c);

1346

1347

return c;

}

static int

tok_get(struct tok_state *tok, const char **p_start, const char **p_end)

1352

{

1353

int c;

1354

int blankline, nonascii;

1355

1356

*p_start = *p_end = NULL;

1357

nextline:

1358

tok->start = NULL;

1359

blankline = 0;

1360

1361

/* Get indentation level */

1362

if (tok->atbol) {

Branch (1362:9): [True: 3.36M, False: 6.82M]

1363

    int col = 0;

1364

    int altcol = 0;

1365

    tok->atbol = 0;

1366

    int cont_line_col = 0;

1367

    for (;;) {

1368

        c = tok_nextc(tok);

1369

        if (c == ' ') {

Branch (1369:17): [True: 4.90M, False: 3.36M]

1370

            col++, altcol++;

1371

1372

        else if (c == '\t') {

Branch (1372:22): [True: 907, False: 3.36M]

1373

            col = (col / tok->tabsize + 1) * tok->tabsize;

1374

            altcol = (altcol / ALTTABSIZE + 1) * ALTTABSIZE;

1375

1376

        else if (c == '\014')  {/* Control-L (formfeed) */

Branch (1376:22): [True: 7, False: 3.36M]

1377

            col = altcol = 0; /* For Emacs users */

1378

1379

        else if (c == '\\') {

Branch (1379:22): [True: 30, False: 3.36M]

1380

            // Indentation cannot be split over multiple physical lines

1381

            // using backslashes. This means that if we found a backslash

1382

            // preceded by whitespace, **the first one we find** determines

1383

            // the level of indentation of whatever comes next.

1384

            cont_line_col = cont_line_col ?

cont_line_col6

col24

;

Branch (1384:33): [True: 6, False: 24]

1385

            if ((c = tok_continuation_line(tok)) == -1) {

Branch (1385:21): [True: 1, False: 29]

1386

                return ERRORTOKEN;

1387

1388

1389

        else {

1390

            break;

1391

1392

1393

    tok_backup(tok, c);

1394

    if (c == '#' ||

c == '\n'3.33M

) {

Branch (1394:13): [True: 33.8k, False: 3.33M] Branch (1394:25): [True: 2.62M, False: 707k]

1395

        /* Lines with only whitespace and/or comments

1396

           shouldn't affect the indentation and are

1397

           not passed to the parser as NEWLINE tokens,

1398

           except *totally* empty lines in interactive

1399

           mode, which signal the end of a command group. */

1400

        if (col == 0 &&

c == '\n'61.5k

tok->prompt != NULL49.4k

) {

Branch (1400:17): [True: 61.5k, False: 2.59M] Branch (1400:29): [True: 49.4k, False: 12.1k] Branch (1400:42): [True: 0, False: 49.4k]

1401

            blankline = 0; /* Let it through */

1402

1403

        else if (tok->prompt != NULL &&

tok->lineno == 10

) {

Branch (1403:22): [True: 0, False: 2.65M] Branch (1403:45): [True: 0, False: 0]

1404

            /* In interactive mode, if the first line contains

1405

               only spaces and/or a comment, let it through. */

1406

            blankline = 0;

1407

            col = altcol = 0;

1408

1409

        else {

1410

            blankline = 1; /* Ignore completely */

1411

1412

        /* We can't jump back right here since we still

1413

           may need to skip to the end of a comment */

1414

1415

    if (!blankline &&

tok->level == 0707k

) {

Branch (1415:13): [True: 707k, False: 2.65M] Branch (1415:27): [True: 593k, False: 114k]

1416

        col = cont_line_col ?

cont_line_col10

col593k

;

Branch (1416:19): [True: 10, False: 593k]

1417

        altcol = cont_line_col ?

cont_line_col10

altcol593k

;

Branch (1417:22): [True: 10, False: 593k]

1418

        if (col == tok->indstack[tok->indent]) {

Branch (1418:17): [True: 445k, False: 147k]

1419

            /* No change */

1420

            if (altcol != tok->altindstack[tok->indent]) {

Branch (1420:21): [True: 2, False: 445k]

1421

                return indenterror(tok);

1422

1423

1424

        else if (col > tok->indstack[tok->indent]) {

Branch (1424:22): [True: 81.6k, False: 65.7k]

1425

            /* Indent -- always one */

1426

            if (tok->indent+1 >= MAXINDENT) {

Branch (1426:21): [True: 0, False: 81.6k]

1427

                tok->done = E_TOODEEP;

1428

                tok->cur = tok->inp;

1429

                return ERRORTOKEN;

1430

1431

            if (altcol <= tok->altindstack[tok->indent]) {

Branch (1431:21): [True: 0, False: 81.6k]

1432

                return indenterror(tok);

1433

1434

            tok->pendin++;

1435

            tok->indstack[++tok->indent] = col;

1436

            tok->altindstack[tok->indent] = altcol;

1437

1438

        else /* col < tok->indstack[tok->indent] */ {

1439

            /* Dedent -- any number, must be consistent */

1440

            while (tok->indent > 0 &&

Branch (1440:24): [True: 132k, False: 14.9k]

1441

col < tok->indstack[tok->indent]132k

) {

Branch (1441:21): [True: 81.6k, False: 50.7k]

1442

                tok->pendin--;

1443

                tok->indent--;

1444

1445

            if (col != tok->indstack[tok->indent]) {

Branch (1445:21): [True: 6, False: 65.7k]

1446

                tok->done = E_DEDENT;

1447

                tok->cur = tok->inp;

1448

                return ERRORTOKEN;

1449

1450

            if (altcol != tok->altindstack[tok->indent]) {

Branch (1450:21): [True: 0, False: 65.7k]

1451

                return indenterror(tok);

tok->start = tok->cur;

1458

1459

/* Return pending indents/dedents */

1460

if (tok->pendin != 0) {

Branch (1460:9): [True: 163k, False: 10.0M]

1461

    if (tok->pendin < 0) {

Branch (1461:13): [True: 81.6k, False: 81.6k]

1462

        tok->pendin++;

1463

        return DEDENT;

1464

1465

    else {

1466

        tok->pendin--;

1467

        return INDENT;

/* Peek ahead at the next character */

1472

c = tok_nextc(tok);

1473

tok_backup(tok, c);

1474

/* Check if we are closing an async function */

1475

if (tok->async_def

Branch (1475:9): [True: 141, False: 10.0M]

1476

&&

!blankline141

Branch (1476:12): [True: 135, False: 6]

1477

    /* Due to some implementation artifacts of type comments,

1478

     * a TYPE_COMMENT at the start of a function won't set an

1479

     * indentation level and it will produce a NEWLINE after it.

1480

     * To avoid spuriously ending an async function due to this,

1481

     * wait until we have some non-newline char in front of us. */

1482

&&

c != '\n'135

Branch (1482:12): [True: 114, False: 21]

1483

&&

tok->level == 0114

Branch (1483:12): [True: 75, False: 39]

1484

    /* There was a NEWLINE after ASYNC DEF,

1485

       so we're past the signature. */

1486

&&

tok->async_def_nl75

Branch (1486:12): [True: 36, False: 39]

1487

    /* Current indentation level is less than where

1488

       the async function was defined */

1489

&&

tok->async_def_indent >= tok->indent36

)

Branch (1489:12): [True: 9, False: 27]

1490

1491

    tok->async_def = 0;

1492

    tok->async_def_indent = 0;

1493

    tok->async_def_nl = 0;

again:

tok->start = NULL;

1498

/* Skip spaces */

1499

do {

1500

    c = tok_nextc(tok);

1501

} while (c == ' ' ||

c == '\t'10.0M

c == '\014'10.0M

);

Branch (1501:14): [True: 1.18M, False: 10.0M] Branch (1501:26): [True: 4, False: 10.0M] Branch (1501:39): [True: 0, False: 10.0M]

1502

1503

/* Set start of current token */

1504

tok->start = tok->cur - 1;

1505

1506

/* Skip comment, unless it's a type comment */

1507

if (c == '#') {

Branch (1507:9): [True: 38.2k, False: 9.99M]

1508

    const char *prefix, *p, *type_start;

1509

1510

    while (c != EOF &&

c != '\n'1.73M

) {

Branch (1510:16): [True: 1.73M, False: 1] Branch (1510:28): [True: 1.69M, False: 38.2k]

1511

        c = tok_nextc(tok);

1512

1513

1514

    if (tok->type_comments) {

Branch (1514:13): [True: 611, False: 37.6k]

1515

        p = tok->start;

1516

        prefix = type_comment_prefix;

1517

        while (*prefix &&

p < tok->cur4.88k

) {

Branch (1517:20): [True: 4.88k, False: 611] Branch (1517:31): [True: 4.88k, False: 0]

1518

            if (*prefix == ' ') {

Branch (1518:21): [True: 1.22k, False: 3.66k]

1519

                while (*p == ' ' ||

*p == '\t'1.22k

) {

Branch (1519:28): [True: 1.22k, False: 1.22k] Branch (1519:41): [True: 0, False: 1.22k]

1520

                    p++;

1521

1522

            } else if (*prefix == *p) {

Branch (1522:28): [True: 3.66k, False: 0]

1523

                p++;

1524

            } else {

1525

                break;

1526

1527

1528

            prefix++;

1529

1530

1531

        /* This is a type comment if we matched all of type_comment_prefix. */

1532

        if (!*prefix) {

Branch (1532:17): [True: 611, False: 0]

1533

            int is_type_ignore = 1;

1534

            const char *ignore_end = p + 6;

1535

            tok_backup(tok, c);  /* don't eat the newline or EOF */

1536

1537

            type_start = p;

1538

1539

            /* A TYPE_IGNORE is "type: ignore" followed by the end of the token

1540

             * or anything ASCII and non-alphanumeric. */

1541

            is_type_ignore = (

1542

                tok->cur >= ignore_end &&

memcmp(p, "ignore", 6) == 0165

Branch (1542:21): [True: 165, False: 446] Branch (1542:47): [True: 92, False: 73]

1543

&&

!(92

tok->cur > ignore_end92

Branch (1543:26): [True: 60, False: 32]

1544

&&

(60

(unsigned char)ignore_end[0] >= 12860

Py_ISALNUM51

(ignore_end[0]))));

Branch (1544:30): [True: 9, False: 51]

1545

1546

            if (is_type_ignore) {

Branch (1546:21): [True: 74, False: 537]

1547

                *p_start = ignore_end;

1548

                *p_end = tok->cur;

1549

1550

                /* If this type ignore is the only thing on the line, consume the newline also. */

1551

                if (blankline) {

Branch (1551:25): [True: 0, False: 74]

1552

                    tok_nextc(tok);

1553

                    tok->atbol = 1;

1554

1555

                return TYPE_IGNORE;

1556

            } else {

1557

                *p_start = type_start;  /* after type_comment_prefix */

1558

                *p_end = tok->cur;

1559

                return TYPE_COMMENT;

if (tok->done == E_INTERACT_STOP) {

Branch (1565:9): [True: 0, False: 10.0M]

1566

    return ENDMARKER;

1567

1568

1569

/* Check for EOF and errors now */

1570

if (c == EOF) {

Branch (1570:9): [True: 55.7k, False: 9.97M]

1571

    if (tok->level) {

Branch (1571:13): [True: 185, False: 55.5k]

1572

        return ERRORTOKEN;

1573

1574

    return tok->done == E_EOF ? ENDMARKER :

ERRORTOKEN0

;

Branch (1574:16): [True: 55.5k, False: 0]

1575

1576

1577

/* Identifier (most frequent token!) */

1578

nonascii = 0;

1579

if (is_potential_identifier_start(c)) {

1580

    /* Process the various legal combinations of b"", r"", u"", and f"". */

1581

    int saw_b = 0, saw_r = 0, saw_u = 0, saw_f = 0;

1582

    while (1) {

Branch (1582:16): [Folded - Ignored]

1583

        if (!(saw_b ||

saw_u2.33M

saw_f2.32M

) &&

(2.27M

c == 'b'2.27M

c == 'B'1.74M

))

Branch (1583:19): [True: 524k, False: 2.33M] Branch (1583:28): [True: 6.35k, False: 2.32M] Branch (1583:37): [True: 52.4k, False: 2.27M] Branch (1583:48): [True: 523k, False: 1.74M] Branch (1583:60): [True: 3.92k, False: 1.74M]

1584

            saw_b = 1;

1585

        /* Since this is a backwards compatibility support literal we don't

1586

           want to support it in arbitrary order like byte literals. */

1587

        else if (!(saw_b ||

saw_u1.80M

saw_r1.79M

saw_f1.74M

)

Branch (1587:24): [True: 524k, False: 1.80M] Branch (1587:33): [True: 6.35k, False: 1.79M] Branch (1587:42): [True: 53.3k, False: 1.74M] Branch (1587:51): [True: 47.2k, False: 1.69M]

1588

&&

(1.69M

c == 'u'1.69M

c == 'U'1.69M

)) {

Branch (1588:26): [True: 5.14k, False: 1.69M] Branch (1588:37): [True: 1.25k, False: 1.69M]

1589

            saw_u = 1;

1590

1591

        /* ur"" and ru"" are not supported */

1592

        else if (!(saw_r ||

saw_u2.26M

) &&

(2.26M

c == 'r'2.26M

c == 'R'2.20M

)) {

Branch (1592:24): [True: 54.7k, False: 2.26M] Branch (1592:33): [True: 6.35k, False: 2.26M] Branch (1592:44): [True: 54.0k, False: 2.20M] Branch (1592:56): [True: 1.55k, False: 2.20M]

1593

            saw_r = 1;

1594

1595

        else if (!(saw_f ||

saw_b2.21M

saw_u1.69M

) &&

(1.68M

c == 'f'1.68M

c == 'F'1.64M

)) {

Branch (1595:24): [True: 47.3k, False: 2.21M] Branch (1595:33): [True: 523k, False: 1.69M] Branch (1595:42): [True: 6.35k, False: 1.68M] Branch (1595:53): [True: 46.8k, False: 1.64M] Branch (1595:65): [True: 4.91k, False: 1.63M]

1596

            saw_f = 1;

1597

1598

        else {

1599

            break;

1600

1601

        c = tok_nextc(tok);

1602

        if (c == '"' ||

c == '''639k

) {

Branch (1602:17): [True: 2.49k, False: 639k] Branch (1602:29): [True: 7.24k, False: 631k]

1603

            goto letter_quote;

1604

1605

1606

    while (2.21M

is_potential_identifier_char(c)) {

1607

        if (c >= 128) {

Branch (1607:17): [True: 98, False: 5.69M]

1608

            nonascii = 1;

1609

1610

        c = tok_nextc(tok);

1611

1612

    tok_backup(tok, c);

1613

    if (nonascii &&

!verify_identifier(tok)48

) {

Branch (1613:13): [True: 48, False: 2.21M] Branch (1613:25): [True: 12, False: 36]

1614

        return ERRORTOKEN;

1615

1616

1617

    *p_start = tok->start;

1618

    *p_end = tok->cur;

1619

1620

    /* async/await parsing block. */

1621

    if (tok->cur - tok->start == 5 &&

tok->start[0] == 'a'75.4k

) {

Branch (1621:13): [True: 75.4k, False: 2.13M] Branch (1621:43): [True: 2.20k, False: 73.2k]

1622

        /* May be an 'async' or 'await' token.  For Python 3.7 or

1623

           later we recognize them unconditionally.  For Python

1624

           3.5 or 3.6 we recognize 'async' in front of 'def', and

1625

           either one inside of 'async def'.  (Technically we

1626

           shouldn't recognize these at all for 3.4 or earlier,

1627

           but there's no *valid* Python 3.4 code that would be

1628

           rejected, and async functions will be rejected in a

1629

           later phase.) */

1630

        if (!tok->async_hacks ||

tok->async_def24

) {

Branch (1630:17): [True: 2.18k, False: 24] Branch (1630:38): [True: 9, False: 15]

1631

            /* Always recognize the keywords. */

1632

            if (memcmp(tok->start, "async", 5) == 0) {

Branch (1632:21): [True: 957, False: 1.23k]

1633

                return ASYNC;

1634

1635

            if (memcmp(tok->start, "await", 5) == 0) {

Branch (1635:21): [True: 219, False: 1.01k]

1636

                return AWAIT;

1637

1638

1639

        else if (memcmp(tok->start, "async", 5) == 0) {

Branch (1639:22): [True: 12, False: 3]

1640

            /* The current token is 'async'.

1641

               Look ahead one token to see if that is 'def'. */

1642

1643

            struct tok_state ahead_tok;

1644

            const char *ahead_tok_start = NULL;

1645

            const char *ahead_tok_end = NULL;

1646

            int ahead_tok_kind;

1647

1648

            memcpy(&ahead_tok, tok, sizeof(ahead_tok));

1649

            ahead_tok_kind = tok_get(&ahead_tok, &ahead_tok_start,

1650

                                     &ahead_tok_end);

1651

1652

            if (ahead_tok_kind == NAME

Branch (1652:21): [True: 9, False: 3]

1653

&&

ahead_tok.cur - ahead_tok.start == 39

Branch (1653:24): [True: 9, False: 0]

1654

&&

memcmp(ahead_tok.start, "def", 3) == 09

)

Branch (1654:24): [True: 9, False: 0]

1655

1656

                /* The next token is going to be 'def', so instead of

1657

                   returning a plain NAME token, return ASYNC. */

1658

                tok->async_def_indent = tok->indent;

1659

                tok->async_def = 1;

1660

                return ASYNC;

    return NAME;

1666

1667

1668

/* Newline */

1669

if (c == '\n') {

Branch (1669:9): [True: 3.31M, False: 4.44M]

1670

    tok->atbol = 1;

1671

    if (blankline ||

tok->level > 0651k

) {

Branch (1671:13): [True: 2.65M, False: 651k] Branch (1671:26): [True: 114k, False: 536k]

1672

        goto nextline;

1673

1674

    *p_start = tok->start;

1675

    *p_end = tok->cur - 1; /* Leave '\n' out of the string */

1676

    tok->cont_line = 0;

1677

    if (tok->async_def) {

Branch (1677:13): [True: 21, False: 536k]

1678

        /* We're somewhere inside an 'async def' function, and

1679

           we've encountered a NEWLINE after its signature. */

1680

        tok->async_def_nl = 1;

1681

1682

    return NEWLINE;

1683

1684

1685

/* Period or number starting with period? */

1686

if (c == '.') {

Branch (1686:9): [True: 423k, False: 4.01M]

1687

    c = tok_nextc(tok);

1688

    if (isdigit(c)) {

1689

        goto fraction;

1690

    } else if (c == '.') {

Branch (1690:20): [True: 1.40k, False: 422k]

1691

        c = tok_nextc(tok);

1692

        if (c == '.') {

Branch (1692:17): [True: 1.39k, False: 15]

1693

            *p_start = tok->start;

1694

            *p_end = tok->cur;

1695

            return ELLIPSIS;

1696

1697

        else {

1698

            tok_backup(tok, c);

1699

1700

        tok_backup(tok, '.');

1701

1702

    else {

1703

        tok_backup(tok, c);

1704

1705

    *p_start = tok->start;

1706

    *p_end = tok->cur;

1707

    return DOT;

1708

1709

1710

/* Number */

1711

if (isdigit(c)) {

1712

    if (c == '0') {

Branch (1712:13): [True: 320k, False: 325k]

1713

        /* Hex, octal or binary -- maybe. */

1714

        c = tok_nextc(tok);

1715

        if (c == 'x' ||

c == 'X'318k

) {

Branch (1715:17): [True: 2.30k, False: 318k] Branch (1715:29): [True: 2, False: 318k]

1716

            /* Hex */

1717

            c = tok_nextc(tok);

1718

            do {

1719

                if (c == '_') {

Branch (1719:25): [True: 17, False: 2.30k]

1720

                    c = tok_nextc(tok);

1721

1722

                if (!isxdigit(c)) {

Branch (1722:25): [True: 16, False: 2.30k]

1723

                    tok_backup(tok, c);

1724

                    return syntaxerror(tok, "invalid hexadecimal literal");

1725

1726

do 2.30k

{

1727

                    c = tok_nextc(tok);

1728

                } while (isxdigit(c));

1729

            } while (c == '_');

Branch (1729:26): [True: 13, False: 2.29k]

1730

            if (!verify_end_of_number(tok, c, "hexadecimal")) {

Branch (1730:21): [True: 12, False: 2.28k]

1731

                return ERRORTOKEN;

1732

1733

1734

        else if (c == 'o' ||

c == 'O'318k

) {

Branch (1734:22): [True: 142, False: 318k] Branch (1734:34): [True: 3, False: 318k]

1735

            /* Octal */

1736

            c = tok_nextc(tok);

1737

            do {

1738

                if (c == '_') {

Branch (1738:25): [True: 12, False: 143]

1739

                    c = tok_nextc(tok);

1740

1741

                if (c < '0' ||

c >= '8'150

) {

Branch (1741:25): [True: 5, False: 150] Branch (1741:36): [True: 7, False: 143]

1742

                    if (isdigit(c)) {

1743

                        return syntaxerror(tok,

1744

                                "invalid digit '%c' in octal literal", c);

1745

1746

                    else {

1747

                        tok_backup(tok, c);

1748

                        return syntaxerror(tok, "invalid octal literal");

1749

1750

1751

do 143

{

1752

                    c = tok_nextc(tok);

1753

                } while ('0' <= c &&

c < '8'527

);

Branch (1753:30): [True: 527, False: 95] Branch (1753:42): [True: 479, False: 48]

1754

            } while (c == '_');

Branch (1754:26): [True: 10, False: 133]

1755

            if (isdigit(c)) {

1756

                return syntaxerror(tok,

1757

                        "invalid digit '%c' in octal literal", c);

1758

1759

            if (!verify_end_of_number(tok, c, "octal")) {

Branch (1759:21): [True: 11, False: 119]

1760

                return ERRORTOKEN;

1761

1762

1763

        else if (c == 'b' ||

c == 'B'318k

) {

Branch (1763:22): [True: 54, False: 318k] Branch (1763:34): [True: 3, False: 318k]

1764

            /* Binary */

1765

            c = tok_nextc(tok);

1766

            do {

1767

                if (c == '_') {

Branch (1767:25): [True: 10, False: 55]

1768

                    c = tok_nextc(tok);

1769

1770

                if (c != '0' &&

c != '1'59

) {

Branch (1770:25): [True: 59, False: 6] Branch (1770:37): [True: 12, False: 47]

1771

                    if (isdigit(c)) {

1772

                        return syntaxerror(tok,

1773

                                "invalid digit '%c' in binary literal", c);

1774

1775

                    else {

1776

                        tok_backup(tok, c);

1777

                        return syntaxerror(tok, "invalid binary literal");

1778

1779

1780

do 53

{

1781

                    c = tok_nextc(tok);

1782

                } while (c == '0' ||

c == '1'384

);

Branch (1782:30): [True: 235, False: 384] Branch (1782:42): [True: 331, False: 53]

1783

            } while (c == '_');

Branch (1783:26): [True: 8, False: 45]

1784

            if (isdigit(c)) {

1785

                return syntaxerror(tok,

1786

                        "invalid digit '%c' in binary literal", c);

1787

1788

            if (!verify_end_of_number(tok, c, "binary")) {

Branch (1788:21): [True: 11, False: 32]

1789

                return ERRORTOKEN;

1790

1791

1792

        else {

1793

            int nonzero = 0;

1794

            /* maybe old-style octal; c is first char of it */

1795

            /* in any case, allow '0' as a literal */

1796

            while (1) {

Branch (1796:24): [Folded - Ignored]

1797

                if (c == '_') {

Branch (1797:25): [True: 13, False: 318k]

1798

                    c = tok_nextc(tok);

1799

                    if (!isdigit(c)) {

Branch (1799:29): [True: 4, False: 9]

1800

                        tok_backup(tok, c);

1801

                        return syntaxerror(tok, "invalid decimal literal");

1802

1803

1804

                if (c != '0') {

Branch (1804:25): [True: 318k, False: 139]

1805

                    break;

1806

1807

                c = tok_nextc(tok);

1808

1809

            char* zeros_end = tok->cur;

1810

            if (isdigit(c)) {

1811

                nonzero = 1;

1812

                c = tok_decimal_tail(tok);

1813

                if (c == 0) {

Branch (1813:25): [True: 0, False: 26]

1814

                    return ERRORTOKEN;

1815

1816

1817

            if (c == '.') {

Branch (1817:21): [True: 445, False: 317k]

1818

                c = tok_nextc(tok);

1819

                goto fraction;

1820

1821

            else if (c == 'e' ||

c == 'E'317k

) {

Branch (1821:26): [True: 11, False: 317k] Branch (1821:38): [True: 2, False: 317k]

1822

                goto exponent;

1823

1824

            else if (c == 'j' ||

c == 'J'317k

) {

Branch (1824:26): [True: 128, False: 317k] Branch (1824:38): [True: 0, False: 317k]

1825

                goto imaginary;

1826

1827

            else if (nonzero) {

Branch (1827:26): [True: 13, False: 317k]

1828

                /* Old-style octal: now disallowed. */

1829

                tok_backup(tok, c);

1830

                return syntaxerror_known_range(

1831

                        tok, (int)(tok->start + 1 - tok->line_start),

1832

                        (int)(zeros_end - tok->line_start),

1833

                        "leading zeros in decimal integer "

1834

                        "literals are not permitted; "

1835

                        "use an 0o prefix for octal integers");

1836

1837

            if (!verify_end_of_number(tok, c, "decimal")) {

Branch (1837:21): [True: 7, False: 317k]

1838

                return ERRORTOKEN;

    else {

1843

        /* Decimal */

1844

        c = tok_decimal_tail(tok);

1845

        if (c == 0) {

Branch (1845:17): [True: 11, False: 325k]

1846

            return ERRORTOKEN;

1847

1848

1849

            /* Accept floating point numbers. */

1850

            if (c == '.') {

Branch (1850:21): [True: 2.05k, False: 323k]

1851

                c = tok_nextc(tok);

1852

    fraction:

1853

                /* Fraction */

1854

                if (isdigit(c)) {

1855

                    c = tok_decimal_tail(tok);

1856

                    if (c == 0) {

Branch (1856:29): [True: 10, False: 2.42k]

1857

                        return ERRORTOKEN;

            if (c == 'e' ||

c == 'E'324k

) {

Branch (1861:21): [True: 1.52k, False: 324k] Branch (1861:33): [True: 1.02k, False: 323k]

1862

                int e;

1863

              exponent:

1864

                e = c;

1865

                /* Exponent part */

1866

                c = tok_nextc(tok);

1867

                if (c == '+' ||

c == '-'1.95k

) {

Branch (1867:25): [True: 604, False: 1.95k] Branch (1867:37): [True: 1.23k, False: 714]

1868

                    c = tok_nextc(tok);

1869

                    if (!isdigit(c)) {

Branch (1869:29): [True: 8, False: 1.83k]

1870

                        tok_backup(tok, c);

1871

                        return syntaxerror(tok, "invalid decimal literal");

1872

1873

                } else

if (714

!isdigit714

(c)) {

Branch (1873:32): [True: 15, False: 699]

1874

                    tok_backup(tok, c);

1875

                    if (!verify_end_of_number(tok, e, "decimal")) {

Branch (1875:29): [True: 10, False: 5]

1876

                        return ERRORTOKEN;

1877

1878

                    tok_backup(tok, e);

1879

                    *p_start = tok->start;

1880

                    *p_end = tok->cur;

1881

                    return NUMBER;

1882

1883

                c = tok_decimal_tail(tok);

1884

                if (c == 0) {

Branch (1884:25): [True: 6, False: 2.52k]

1885

                    return ERRORTOKEN;

1886

1887

1888

            if (c == 'j' ||

c == 'J'325k

) {

Branch (1888:21): [True: 490, False: 325k] Branch (1888:33): [True: 0, False: 325k]

1889

                /* Imaginary part */

1890

    imaginary:

1891

                c = tok_nextc(tok);

1892

                if (!verify_end_of_number(tok, c, "imaginary")) {

Branch (1892:25): [True: 10, False: 608]

1893

                    return ERRORTOKEN;

1894

1895

1896

            else if (!verify_end_of_number(tok, c, "decimal")) {

Branch (1896:26): [True: 27, False: 325k]

1897

                return ERRORTOKEN;

    tok_backup(tok, c);

1902

    *p_start = tok->start;

1903

    *p_end = tok->cur;

1904

    return NUMBER;

letter_quote:

/* String */

1909

if (c == '\'' ||

c == '"'3.27M

) {

Branch (1909:9): [True: 107k, False: 3.27M] Branch (1909:22): [True: 98.8k, False: 3.17M]

1910

    int quote = c;

1911

    int quote_size = 1;             /* 1 or 3 */

1912

    int end_quote_size = 0;

1913

1914

    /* Nodes of type STRING, especially multi line strings

1915

       must be handled differently in order to get both

1916

       the starting line number and the column offset right.

1917

       (cf. issue 16806) */

1918

    tok->first_lineno = tok->lineno;

1919

    tok->multi_line_start = tok->line_start;

1920

1921

    /* Find the quote size and start of string */

1922

    c = tok_nextc(tok);

1923

    if (c == quote) {

Branch (1923:13): [True: 76.2k, False: 129k]

1924

        c = tok_nextc(tok);

1925

        if (c == quote) {

Branch (1925:17): [True: 8.52k, False: 67.6k]

1926

            quote_size = 3;

1927

1928

        else {

1929

            end_quote_size = 1;     /* empty string found */

1930

1931

1932

    if (c != quote) {

Branch (1932:13): [True: 197k, False: 8.52k]

1933

        tok_backup(tok, c);

1934

1935

1936

    /* Get rest of string */

1937

    while (end_quote_size != quote_size) {

Branch (1937:16): [True: 4.20M, False: 206k]

1938

        c = tok_nextc(tok);

1939

        if (c == EOF ||

(4.20M

quote_size == 14.20M

c == '\n'1.72M

)) {

Branch (1939:17): [True: 11, False: 4.20M] Branch (1939:30): [True: 1.72M, False: 2.48M] Branch (1939:49): [True: 5, False: 1.72M]

1940

            assert(tok->multi_line_start != NULL);

1941

            // shift the tok_state's location into

1942

            // the start of string, and report the error

1943

            // from the initial quote character

1944

            tok->cur = (char *)tok->start;

1945

            tok->cur++;

1946

            tok->line_start = tok->multi_line_start;

1947

            int start = tok->lineno;

1948

            tok->lineno = tok->first_lineno;

1949

            if (quote_size == 3) {

Branch (1949:21): [True: 5, False: 11]

1950

                syntaxerror(tok, "unterminated triple-quoted string literal"

1951

                                 " (detected at line %d)", start);

1952

                if (c != '\n') {

Branch (1952:25): [True: 5, False: 0]

1953

                    tok->done = E_EOFS;

1954

1955

                return ERRORTOKEN;

1956

1957

            else {

1958

                syntaxerror(tok, "unterminated string literal (detected at"

1959

                                 " line %d)", start);

1960

                if (c != '\n') {

Branch (1960:25): [True: 6, False: 5]

1961

                    tok->done = E_EOLS;

1962

1963

                return ERRORTOKEN;

1964

1965

1966

        if (c == quote) {

Branch (1966:17): [True: 160k, False: 4.04M]

1967

            end_quote_size += 1;

1968

1969

        else {

1970

            end_quote_size = 0;

1971

            if (c == '\\') {

Branch (1971:21): [True: 29.6k, False: 4.01M]

1972

                tok_nextc(tok);  /* skip escaped char */

    *p_start = tok->start;

1978

    *p_end = tok->cur;

1979

    return STRING;

1980

1981

1982

/* Line continuation */

1983

if (c == '\\') {

Branch (1983:9): [True: 463, False: 3.17M]

1984

    if ((c = tok_continuation_line(tok)) == -1) {

Branch (1984:13): [True: 15, False: 448]

1985

        return ERRORTOKEN;

1986

1987

    tok->cont_line = 1;

1988

    goto again; /* Read next line */

1989

1990

1991

/* Check for two-character token */

1992

1993

    int c2 = tok_nextc(tok);

1994

    int token = _PyToken_TwoChars(c, c2);

1995

    if (token != OP) {

Branch (1995:13): [True: 18.4k, False: 3.15M]

1996

        int c3 = tok_nextc(tok);

1997

        int token3 = _PyToken_ThreeChars(c, c2, c3);

1998

        if (token3 != OP) {

Branch (1998:17): [True: 89, False: 18.3k]

1999

            token = token3;

2000

2001

        else {

2002

            tok_backup(tok, c3);

2003

2004

        *p_start = tok->start;

2005

        *p_end = tok->cur;

2006

        return token;

2007

2008

    tok_backup(tok, c2);

2009

2010

2011

/* Keep track of parentheses nesting level */

2012

switch (c) {

Branch (2012:13): [True: 1.35M, False: 1.80M]

2013

case '(':

Branch (2013:5): [True: 565k, False: 2.58M]

2014

case '[':

Branch (2014:5): [True: 332k, False: 2.82M]

2015

case '{':

Branch (2015:5): [True: 4.78k, False: 3.15M]

2016

    if (tok->level >= MAXLEVEL) {

Branch (2016:13): [True: 1, False: 902k]

2017

        return syntaxerror(tok, "too many nested parentheses");

2018

2019

    tok->parenstack[tok->level] = c;

2020

    tok->parenlinenostack[tok->level] = tok->lineno;

2021

    tok->parencolstack[tok->level] = (int)(tok->start - tok->line_start);

2022

    tok->level++;

2023

    break;

2024

case ')':

Branch (2024:5): [True: 565k, False: 2.59M]

2025

case ']':

Branch (2025:5): [True: 332k, False: 2.82M]

2026

case '}':

Branch (2026:5): [True: 4.72k, False: 3.15M]

2027

    if (!tok->level) {

Branch (2027:13): [True: 12, False: 902k]

2028

        return syntaxerror(tok, "unmatched '%c'", c);

2029

2030

    tok->level--;

2031

    int opening = tok->parenstack[tok->level];

2032

    if (!((opening == '(' &&

c == ')'565k

) ||

Branch (2032:16): [True: 565k, False: 336k] Branch (2032:34): [True: 565k, False: 2]

2033

(336k

opening == '['336k

c == ']'332k

) ||

Branch (2033:16): [True: 332k, False: 4.72k] Branch (2033:34): [True: 332k, False: 2]

2034

(4.72k

opening == '{'4.72k

c == '}'4.72k

)))

Branch (2034:16): [True: 4.72k, False: 4] Branch (2034:34): [True: 4.72k, False: 1]

2035

2036

        if (tok->parenlinenostack[tok->level] != tok->lineno) {

Branch (2036:17): [True: 0, False: 5]

2037

            return syntaxerror(tok,

2038

                    "closing parenthesis '%c' does not match "

2039

                    "opening parenthesis '%c' on line %d",

2040

                    c, opening, tok->parenlinenostack[tok->level]);

2041

2042

        else {

2043

            return syntaxerror(tok,

2044

                    "closing parenthesis '%c' does not match "

2045

                    "opening parenthesis '%c'",

2046

                    c, opening);

2047

2048

2049

    break;

2050

2051

2052

if (!Py_UNICODE_ISPRINTABLE(c)) {

Branch (2052:9): [True: 1, False: 3.15M]

2053

    char hex[9];

2054

    (void)PyOS_snprintf(hex, sizeof(hex), "%04X", c);

2055

    return syntaxerror(tok, "invalid non-printable character U+%s", hex);

2056

2057

2058

/* Punctuation character */

2059

*p_start = tok->start;

2060

*p_end = tok->cur;

2061

return _PyToken_OneChar(c);

}

int

_PyTokenizer_Get(struct tok_state *tok,

2066

             const char **p_start, const char **p_end)

2067

{

2068

int result = tok_get(tok, p_start, p_end);

2069

if (tok->decoding_erred) {

Branch (2069:9): [True: 0, False: 7.42M]

2070

    result = ERRORTOKEN;

2071

    tok->done = E_DECODE;

2072

2073

return result;

2074

}

2075

2076

#if defined(__wasi__) || (defined(__EMSCRIPTEN__) && (__EMSCRIPTEN_major__ >= 3))
// fdopen() with borrowed fd. WASI does not provide dup() and Emscripten's
// dup() emulation with open() is slow.

/* Lets an int file descriptor travel through the void* cookie argument of
   fopencookie() without pointer<->integer casts. */
typedef union {
    void *cookie;
    int fd;
} borrowed;

/* Read callback for fopencookie(): recover the fd stored in the cookie and
   forward the request to read().  Returns whatever read() returns. */
static ssize_t
borrow_read(void *cookie, char *buf, size_t size)
{
    borrowed b = {.cookie = cookie};
    return read(b.fd, (void *)buf, size);
}

/* Wrap fd in a read-only FILE* without duplicating or taking ownership of
   the descriptor.  Returns the stream produced by fopencookie(). */
static FILE *
fdopen_borrow(int fd) {
    // supports only reading. seek fails. close and write are no-ops.
    cookie_io_functions_t io_cb = {borrow_read, NULL, NULL, NULL};
    borrowed b = {.fd = fd};
    return fopencookie(b.cookie, "r", io_cb);
}
#else
/* Wrap a duplicate of fd in a read-only FILE*, so that fclose() on the
   returned stream does not close the caller's descriptor.  Returns NULL if
   either _Py_dup() or fdopen() fails. */
static FILE *
fdopen_borrow(int fd) {
    fd = _Py_dup(fd);
    if (fd < 0) {
        return NULL;
    }
    FILE *fp = fdopen(fd, "r");
    if (fp == NULL) {
        /* fdopen() does not close the descriptor on failure, so release the
           duplicate here instead of leaking it.
           NOTE(review): close() is expected to be declared via Python.h, the
           same way read() is for the WASI variant -- confirm on all
           supported platforms. */
        close(fd);
    }
    return fp;
}
#endif

/* Get the encoding of a Python file. Check for the coding cookie and check if
   the file starts with a BOM.

   _PyTokenizer_FindEncodingFilename() returns NULL when it can't find the
   encoding in the first or second line of the file (in which case the encoding
   should be assumed to be UTF-8).

   The char* returned is malloc'ed via PyMem_Malloc() and thus must be freed
   by the caller. */

char *
_PyTokenizer_FindEncodingFilename(int fd, PyObject *filename)
{
    /* Tokenizes the start of the file behind fd and returns a
       PyMem_Malloc()'ed copy of the encoding recorded by the tokenizer, or
       NULL if none was recorded or any step failed.  `filename` may be NULL,
       in which case "<string>" is used as the tokenizer's filename. */
    struct tok_state *tok;
    FILE *fp;
    const char *p_start = NULL;
    const char *p_end = NULL;
    char *encoding = NULL;

    fp = fdopen_borrow(fd);
    if (fp == NULL) {
        return NULL;
    }
    tok = _PyTokenizer_FromFile(fp, NULL, NULL, NULL);
    if (tok == NULL) {
        fclose(fp);
        return NULL;
    }
    if (filename != NULL) {
        /* The tokenizer state holds its own reference to the filename. */
        Py_INCREF(filename);
        tok->filename = filename;
    }
    else {
        tok->filename = PyUnicode_FromString("<string>");
        if (tok->filename == NULL) {
            fclose(fp);
            _PyTokenizer_Free(tok);
            return NULL;
        }
    }
    /* Pull tokens until the tokenizer has moved past the first two lines or
       stops with a status other than E_OK; the tokens themselves are
       discarded. */
    while (tok->lineno < 2 && tok->done == E_OK) {
        _PyTokenizer_Get(tok, &p_start, &p_end);
    }
    fclose(fp);
    if (tok->encoding) {
        /* Copy the name (including the terminating NUL) before the
           tokenizer state is freed. */
        size_t size = strlen(tok->encoding) + 1;
        encoding = (char *)PyMem_Malloc(size);
        if (encoding) {
            memcpy(encoding, tok->encoding, size);
        }
    }
    _PyTokenizer_Free(tok);
    return encoding;
}

#ifdef Py_DEBUG

void

tok_dump(int type, char *start, char *end)

2166

{

2167

fprintf(stderr, "%s", _PyParser_TokenNames[type]);

2168

if (type == NAME || type == NUMBER || type == STRING || type == OP)

2169

    fprintf(stderr, "(%.*s)", (int)(end - start), start);

2170

}

2171

#endif // Py_DEBUG