#15114: the strict mode and argument of HTMLParser, HTMLParser.error,… · python/cpython@73a4359 (original) (raw)

`@@ -29,35 +29,15 @@

`

29

29

`piclose = re.compile('>')

`

30

30

`commentclose = re.compile(r'--\s*>')

`

31

31

`# Note:

`

32

``

`-

# 1) the strict attrfind isn't really strict, but we can't make it

`

33

``

`-

#    correctly strict without breaking backward compatibility;

`

34

``

`-

# 2) if you change tagfind/attrfind remember to update locatestarttagend too;

`

35

``

`-

# 3) if you change tagfind/attrfind and/or locatestarttagend the parser will

`

``

32

`+

# 1) if you change tagfind/attrfind remember to update locatestarttagend too;

`

``

33

`+

# 2) if you change tagfind/attrfind and/or locatestarttagend the parser will

`

36

34

`# explode, so don't do it.

`

37

``

`-

tagfind = re.compile('([a-zA-Z][-.a-zA-Z0-9:_]*)(?:\s|/(?!>))*')

`

38

35

`# see http://www.w3.org/TR/html5/tokenization.html#tag-open-state

`

39

36

`# and http://www.w3.org/TR/html5/tokenization.html#tag-name-state

`

40

37

`tagfind_tolerant = re.compile('([a-zA-Z][^\t\n\r\f />\x00]*)(?:\s|/(?!>))*')

`

41

``

`-

attrfind = re.compile(

`

42

``

`-

r'\s*([a-zA-Z_][-.:a-zA-Z_0-9]*)(\s*=\s*'

`

43

``

`` -

r'(\'[^\']*\'|"[^"]*"|[^\s"\'=<>`]*))?')

``

44

38

`attrfind_tolerant = re.compile(

`

45

39

`r'((?<=[\'"\s/])[^\s/>][^\s/=>]*)(\s*=+\s*'

`

46

40

`r'(\'[^\']*\'|"[^"]*"|(?![\'"])[^>\s]*))?(?:\s|/(?!>))*')

`

47

``

`-

locatestarttagend = re.compile(r"""

`

48

``

`-

<[a-zA-Z][-.a-zA-Z0-9:_]* # tag name

`

49

``

`-

(?:\s+ # whitespace before attribute name

`

50

``

`-

(?:[a-zA-Z_][-.:a-zA-Z0-9_]* # attribute name

`

51

``

`-

(?:\s*=\s* # value indicator

`

52

``

`-

(?:'[^']*' # LITA-enclosed value

`

53

``

`-

|"[^"]*" # LIT-enclosed value

`

54

``

`-

|[^'">\s]+ # bare value

`

55

``

`-

)

`

56

``

`-

)?

`

57

``

`-

)

`

58

``

`-

)*

`

59

``

`-

\s* # trailing whitespace

`

60

``

`-

""", re.VERBOSE)

`

61

41

`locatestarttagend_tolerant = re.compile(r"""

`

62

42

` <[a-zA-Z][^\t\n\r\f />\x00]* # tag name

`

63

43

` (?:[\s/]* # optional whitespace before attribute name

`

`@@ -79,24 +59,6 @@

`

79

59

`endtagfind = re.compile('</\s*([a-zA-Z][-.a-zA-Z0-9:_]*)\s*>')

`

80

60

``

81

61

``

82

``

`-

class HTMLParseError(Exception):

`

83

``

`-

"""Exception raised for all parse errors."""

`

84

``

-

85

``

`-

def __init__(self, msg, position=(None, None)):

`

86

``

`-

assert msg

`

87

``

`-

self.msg = msg

`

88

``

`-

self.lineno = position[0]

`

89

``

`-

self.offset = position[1]

`

90

``

-

91

``

`-

def __str__(self):

`

92

``

`-

result = self.msg

`

93

``

`-

if self.lineno is not None:

`

94

``

`-

result = result + ", at line %d" % self.lineno

`

95

``

`-

if self.offset is not None:

`

96

``

`-

result = result + ", column %d" % (self.offset + 1)

`

97

``

`-

return result

`

98

``

-

99

``

-

100

62

`_default_sentinel = object()

`

101

63

``

102

64

`class HTMLParser(_markupbase.ParserBase):

`

`@@ -123,22 +85,12 @@ class HTMLParser(_markupbase.ParserBase):

`

123

85

``

124

86

`CDATA_CONTENT_ELEMENTS = ("script", "style")

`

125

87

``

126

``

`-

def __init__(self, strict=_default_sentinel, *,

`

127

``

`-

convert_charrefs=_default_sentinel):

`

``

88

`+

def __init__(self, *, convert_charrefs=_default_sentinel):

`

128

89

`"""Initialize and reset this instance.

`

129

90

``

130

91

` If convert_charrefs is True (default: False), all character references

`

131

92

` are automatically converted to the corresponding Unicode characters.

`

132

``

`-

If strict is set to False (the default) the parser will parse invalid

`

133

``

`-

markup, otherwise it will raise an error. Note that the strict mode

`

134

``

`-

and argument are deprecated.

`

135

93

` """

`

136

``

`-

if strict is not _default_sentinel:

`

137

``

`-

warnings.warn("The strict argument and mode are deprecated.",

`

138

``

`-

DeprecationWarning, stacklevel=2)

`

139

``

`-

else:

`

140

``

`-

strict = False # default

`

141

``

`-

self.strict = strict

`

142

94

`if convert_charrefs is _default_sentinel:

`

143

95

`convert_charrefs = False # default

`

144

96

`warnings.warn("The value of convert_charrefs will become True in "

`

`@@ -168,11 +120,6 @@ def close(self):

`

168

120

`"""Handle any buffered data."""

`

169

121

`self.goahead(1)

`

170

122

``

171

``

`-

def error(self, message):

`

172

``

`-

warnings.warn("The 'error' method is deprecated.",

`

173

``

`-

DeprecationWarning, stacklevel=2)

`

174

``

`-

raise HTMLParseError(message, self.getpos())

`

175

``

-

176

123

`__starttag_text = None

`

177

124

``

178

125

`def get_starttag_text(self):

`

`@@ -227,10 +174,7 @@ def goahead(self, end):

`

227

174

`elif startswith("<?", i):

`

228

175

`k = self.parse_pi(i)

`

229

176

`elif startswith("<!", i):

`

230

``

`-

if self.strict:

`

231

``

`-

k = self.parse_declaration(i)

`

232

``

`-

else:

`

233

``

`-

k = self.parse_html_declaration(i)

`

``

177

`+

k = self.parse_html_declaration(i)

`

234

178

`elif (i + 1) < n:

`

235

179

`self.handle_data("<")

`

236

180

`k = i + 1

`

`@@ -239,8 +183,6 @@ def goahead(self, end):

`

239

183

`if k < 0:

`

240

184

`if not end:

`

241

185

`break

`

242

``

`-

if self.strict:

`

243

``

`-

self.error("EOF in middle of construct")

`

244

186

`k = rawdata.find('>', i + 1)

`

245

187

`if k < 0:

`

246

188

`k = rawdata.find('<', i + 1)

`

`@@ -282,13 +224,10 @@ def goahead(self, end):

`

282

224

`if match:

`

283

225

`# match.group() will contain at least 2 chars

`

284

226

`if end and match.group() == rawdata[i:]:

`

285

``

`-

if self.strict:

`

286

``

`-

self.error("EOF in middle of entity or char ref")

`

287

``

`-

else:

`

288

``

`-

k = match.end()

`

289

``

`-

if k <= i:

`

290

``

`-

k = n

`

291

``

`-

i = self.updatepos(i, i + 1)

`

``

227

`+

k = match.end()

`

``

228

`+

if k <= i:

`

``

229

`+

k = n

`

``

230

`+

i = self.updatepos(i, i + 1)

`

292

231

`# incomplete

`

293

232

`break

`

294

233

`elif (i + 1) < n:

`

`@@ -367,18 +306,12 @@ def parse_starttag(self, i):

`

367

306

``

368

307

`# Now parse the data between i+1 and j into a tag and attrs

`

369

308

`attrs = []

`

370

``

`-

if self.strict:

`

371

``

`-

match = tagfind.match(rawdata, i+1)

`

372

``

`-

else:

`

373

``

`-

match = tagfind_tolerant.match(rawdata, i+1)

`

``

309

`+

match = tagfind_tolerant.match(rawdata, i+1)

`

374

310

`assert match, 'unexpected call to parse_starttag()'

`

375

311

`k = match.end()

`

376

312

`self.lasttag = tag = match.group(1).lower()

`

377

313

`while k < endpos:

`

378

``

`-

if self.strict:

`

379

``

`-

m = attrfind.match(rawdata, k)

`

380

``

`-

else:

`

381

``

`-

m = attrfind_tolerant.match(rawdata, k)

`

``

314

`+

m = attrfind_tolerant.match(rawdata, k)

`

382

315

`if not m:

`

383

316

`break

`

384

317

`attrname, rest, attrvalue = m.group(1, 2, 3)

`

`@@ -401,9 +334,6 @@ def parse_starttag(self, i):

`

401

334

`- self.__starttag_text.rfind("\n")

`

402

335

`else:

`

403

336

`offset = offset + len(self.__starttag_text)

`

404

``

`-

if self.strict:

`

405

``

`-

self.error("junk characters in start tag: %r"

`

406

``

`-

% (rawdata[k:endpos][:20],))

`

407

337

`self.handle_data(rawdata[i:endpos])

`

408

338

`return endpos

`

409

339

`if end.endswith('/>'):

`

`@@ -419,10 +349,7 @@ def parse_starttag(self, i):

`

419

349

`# or -1 if incomplete.

`

420

350

`def check_for_whole_start_tag(self, i):

`

421

351

`rawdata = self.rawdata

`

422

``

`-

if self.strict:

`

423

``

`-

m = locatestarttagend.match(rawdata, i)

`

424

``

`-

else:

`

425

``

`-

m = locatestarttagend_tolerant.match(rawdata, i)

`

``

352

`+

m = locatestarttagend_tolerant.match(rawdata, i)

`

426

353

`if m:

`

427

354

`j = m.end()

`

428

355

`next = rawdata[j:j+1]

`

`@@ -435,9 +362,6 @@ def check_for_whole_start_tag(self, i):

`

435

362

`# buffer boundary

`

436

363

`return -1

`

437

364

`# else bogus input

`

438

``

`-

if self.strict:

`

439

``

`-

self.updatepos(i, j + 1)

`

440

``

`-

self.error("malformed empty start tag")

`

441

365

`if j > i:

`

442

366

`return j

`

443

367

`else:

`

`@@ -450,9 +374,6 @@ def check_for_whole_start_tag(self, i):

`

450

374

`# end of input in or before attribute value, or we have the

`

451

375

`# '/' from a '/>' ending

`

452

376

`return -1

`

453

``

`-

if self.strict:

`

454

``

`-

self.updatepos(i, j)

`

455

``

`-

self.error("malformed start tag")

`

456

377

`if j > i:

`

457

378

`return j

`

458

379

`else:

`

`@@ -472,8 +393,6 @@ def parse_endtag(self, i):

`

472

393

`if self.cdata_elem is not None:

`

473

394

`self.handle_data(rawdata[i:gtpos])

`

474

395

`return gtpos

`

475

``

`-

if self.strict:

`

476

``

`-

self.error("bad end tag: %r" % (rawdata[i:gtpos],))

`

477

396

`# find the name: w3.org/TR/html5/tokenization.html#tag-name-state

`

478

397

`namematch = tagfind_tolerant.match(rawdata, i+2)

`

479

398

`if not namematch:

`

`@@ -539,8 +458,7 @@ def handle_pi(self, data):

`

539

458

`pass

`

540

459

``

541

460

`def unknown_decl(self, data):

`

542

``

`-

if self.strict:

`

543

``

`-

self.error("unknown declaration: %r" % (data,))

`

``

461

`+

pass

`

544

462

``

545

463

`# Internal -- helper to remove special character quoting

`

546

464

`def unescape(self, s):

`