#15114: the strict mode and argument of HTMLParser, HTMLParser.error,… · python/cpython@73a4359 (original) (raw)

`@@ -29,35 +29,15 @@

29

`piclose = re.compile('>')

30

`commentclose = re.compile(r'--\s*>')

31

`# Note:

32

#  1) the strict attrfind isn't really strict, but we can't make it

33

#     correctly strict without breaking backward compatibility;

34

#  2) if you change tagfind/attrfind remember to update locatestarttagend too;

35

#  3) if you change tagfind/attrfind and/or locatestarttagend the parser will

32

#  1) if you change tagfind/attrfind remember to update locatestarttagend too;

33

#  2) if you change tagfind/attrfind and/or locatestarttagend the parser will

36

34

`# explode, so don't do it.

37

tagfind = re.compile('([a-zA-Z][-.a-zA-Z0-9:_]*)(?:\s|/(?!>))*')

38

35

`# see http://www.w3.org/TR/html5/tokenization.html#tag-open-state

39

36

`# and http://www.w3.org/TR/html5/tokenization.html#tag-name-state

40

37

`tagfind_tolerant = re.compile('([a-zA-Z][^\t\n\r\f />\x00]*)(?:\s|/(?!>))*')

41

attrfind = re.compile(

42

r'\s*([a-zA-Z_][-.:a-zA-Z_0-9]*)(\s*=\s*'

43

r'(\'[^\']*\'|"[^"]*"|[^\s"\'=<>`]*))?')

44

38

`attrfind_tolerant = re.compile(

45

39

`r'((?<=[\'"\s/])[^\s/>][^\s/=>]*)(\s*=+\s*'

46

40

`r'(\'[^\']*\'|"[^"]*"|(?![\'"])[^>\s]*))?(?:\s|/(?!>))*')

47

locatestarttagend = re.compile(r"""

48

<[a-zA-Z][-.a-zA-Z0-9:_]* # tag name

49

(?:\s+ # whitespace before attribute name

50

(?:[a-zA-Z_][-.:a-zA-Z0-9_]* # attribute name

51

(?:\s*=\s* # value indicator

52

(?:'[^']*' # LITA-enclosed value

53

|"[^"]*" # LIT-enclosed value

54

|[^'">\s]+ # bare value

55

)

56

)?

57

)

58

)*

59

\s* # trailing whitespace

60

""", re.VERBOSE)

61

41

`locatestarttagend_tolerant = re.compile(r"""

62

42

` <[a-zA-Z][^\t\n\r\f />\x00]* # tag name

63

43

` (?:[\s/]* # optional whitespace before attribute name

`@@ -79,24 +59,6 @@

79

59

`endtagfind = re.compile('</\s*([a-zA-Z][-.a-zA-Z0-9:_]*)\s*>')

80

60

81

61

82

class HTMLParseError(Exception):

83

"""Exception raised for all parse errors."""

84

-

85

def __init__(self, msg, position=(None, None)):

86

assert msg

87

self.msg = msg

88

self.lineno = position[0]

89

self.offset = position[1]

90

-

91

def __str__(self):

92

result = self.msg

93

if self.lineno is not None:

94

result = result + ", at line %d" % self.lineno

95

if self.offset is not None:

96

result = result + ", column %d" % (self.offset + 1)

97

return result

98

-

99

-

100

62

`_default_sentinel = object()

101

63

102

64

`class HTMLParser(_markupbase.ParserBase):

`@@ -123,22 +85,12 @@ class HTMLParser(_markupbase.ParserBase):

123

85

124

86

`CDATA_CONTENT_ELEMENTS = ("script", "style")

125

87

126

def __init__(self, strict=_default_sentinel, *,

127

convert_charrefs=_default_sentinel):

88

def __init__(self, *, convert_charrefs=_default_sentinel):

128

89

`"""Initialize and reset this instance.

129

90

130

91

` If convert_charrefs is True (default: False), all character references

131

92

` are automatically converted to the corresponding Unicode characters.

132

If strict is set to False (the default) the parser will parse invalid

133

markup, otherwise it will raise an error. Note that the strict mode

134

and argument are deprecated.

135

93

` """

136

if strict is not _default_sentinel:

137

warnings.warn("The strict argument and mode are deprecated.",

138

DeprecationWarning, stacklevel=2)

139

else:

140

strict = False # default

141

self.strict = strict

142

94

`if convert_charrefs is _default_sentinel:

143

95

`convert_charrefs = False # default

144

96

`warnings.warn("The value of convert_charrefs will become True in "

`@@ -168,11 +120,6 @@ def close(self):

168

120

`"""Handle any buffered data."""

169

121

`self.goahead(1)

170

122

171

def error(self, message):

172

warnings.warn("The 'error' method is deprecated.",

173

DeprecationWarning, stacklevel=2)

174

raise HTMLParseError(message, self.getpos())

175

-

176

123

`__starttag_text = None

177

124

178

125

`def get_starttag_text(self):

`@@ -227,10 +174,7 @@ def goahead(self, end):

227

174

`elif startswith("<?", i):

228

175

`k = self.parse_pi(i)

229

176

`elif startswith("<!", i):

230

if self.strict:

231

k = self.parse_declaration(i)

232

else:

233

k = self.parse_html_declaration(i)

177

k = self.parse_html_declaration(i)

234

178

`elif (i + 1) < n:

235

179

`self.handle_data("<")

236

180

`k = i + 1

`@@ -239,8 +183,6 @@ def goahead(self, end):

239

183

`if k < 0:

240

184

`if not end:

241

185

`break

242

if self.strict:

243

self.error("EOF in middle of construct")

244

186

`k = rawdata.find('>', i + 1)

245

187

`if k < 0:

246

188

`k = rawdata.find('<', i + 1)

`@@ -282,13 +224,10 @@ def goahead(self, end):

282

224

`if match:

283

225

`# match.group() will contain at least 2 chars

284

226

`if end and match.group() == rawdata[i:]:

285

if self.strict:

286

self.error("EOF in middle of entity or char ref")

287

else:

288

k = match.end()

289

if k <= i:

290

k = n

291

i = self.updatepos(i, i + 1)

227

k = match.end()

228

if k <= i:

229

k = n

230

i = self.updatepos(i, i + 1)

292

231

`# incomplete

293

232

`break

294

233

`elif (i + 1) < n:

`@@ -367,18 +306,12 @@ def parse_starttag(self, i):

367

306

368

307

`# Now parse the data between i+1 and j into a tag and attrs

369

308

`attrs = []

370

if self.strict:

371

match = tagfind.match(rawdata, i+1)

372

else:

373

match = tagfind_tolerant.match(rawdata, i+1)

309

match = tagfind_tolerant.match(rawdata, i+1)

374

310

`assert match, 'unexpected call to parse_starttag()'

375

311

`k = match.end()

376

312

`self.lasttag = tag = match.group(1).lower()

377

313

`while k < endpos:

378

if self.strict:

379

m = attrfind.match(rawdata, k)

380

else:

381

m = attrfind_tolerant.match(rawdata, k)

314

m = attrfind_tolerant.match(rawdata, k)

382

315

`if not m:

383

316

`break

384

317

`attrname, rest, attrvalue = m.group(1, 2, 3)

`@@ -401,9 +334,6 @@ def parse_starttag(self, i):

401

334

`- self.__starttag_text.rfind("\n")

402

335

`else:

403

336

`offset = offset + len(self.__starttag_text)

404

if self.strict:

405

self.error("junk characters in start tag: %r"

406

% (rawdata[k:endpos][:20],))

407

337

`self.handle_data(rawdata[i:endpos])

408

338

`return endpos

409

339

`if end.endswith('/>'):

`@@ -419,10 +349,7 @@ def parse_starttag(self, i):

419

349

`# or -1 if incomplete.

420

350

`def check_for_whole_start_tag(self, i):

421

351

`rawdata = self.rawdata

422

if self.strict:

423

m = locatestarttagend.match(rawdata, i)

424

else:

425

m = locatestarttagend_tolerant.match(rawdata, i)

352

m = locatestarttagend_tolerant.match(rawdata, i)

426

353

`if m:

427

354

`j = m.end()

428

355

`next = rawdata[j:j+1]

`@@ -435,9 +362,6 @@ def check_for_whole_start_tag(self, i):

435

362

`# buffer boundary

436

363

`return -1

437

364

`# else bogus input

438

if self.strict:

439

self.updatepos(i, j + 1)

440

self.error("malformed empty start tag")

441

365

`if j > i:

442

366

`return j

443

367

`else:

`@@ -450,9 +374,6 @@ def check_for_whole_start_tag(self, i):

450

374

`# end of input in or before attribute value, or we have the

451

375

`# '/' from a '/>' ending

452

376

`return -1

453

if self.strict:

454

self.updatepos(i, j)

455

self.error("malformed start tag")

456

377

`if j > i:

457

378

`return j

458

379

`else:

`@@ -472,8 +393,6 @@ def parse_endtag(self, i):

472

393

`if self.cdata_elem is not None:

473

394

`self.handle_data(rawdata[i:gtpos])

474

395

`return gtpos

475

if self.strict:

476

self.error("bad end tag: %r" % (rawdata[i:gtpos],))

477

396

`# find the name: w3.org/TR/html5/tokenization.html#tag-name-state

478

397

`namematch = tagfind_tolerant.match(rawdata, i+2)

479

398

`if not namematch:

`@@ -539,8 +458,7 @@ def handle_pi(self, data):

539

458

`pass

540

459

541

460

`def unknown_decl(self, data):

542

if self.strict:

543

self.error("unknown declaration: %r" % (data,))

461

pass

544

462

545

463

`# Internal -- helper to remove special character quoting

546

464

`def unescape(self, s):