Mercurial > cpython
changeset 74946:3d7904e3f4b9 2.7
#13987: HTMLParser is now able to handle malformed start tags. [#13987]
| author | Ezio Melotti <ezio.melotti@gmail.com> |
|---|---|
| date | Wed, 15 Feb 2012 13:19:10 +0200 |
| parents | 11a31eb5da93 |
| children | 96f5718bf005 |
| files | Lib/HTMLParser.py Lib/test/test_htmlparser.py Misc/NEWS |
| diffstat | 3 files changed, 9 insertions(+), 6 deletions(-) [+] |
line wrap: on
line diff
--- a/Lib/HTMLParser.py
+++ b/Lib/HTMLParser.py
@@ -315,8 +315,8 @@ class HTMLParser(markupbase.ParserBase):
                          - self.__starttag_text.rfind("\n")
             else:
                 offset = offset + len(self.__starttag_text)
-            self.error("junk characters in start tag: %r"
-                       % (rawdata[k:endpos][:20],))
+            self.handle_data(rawdata[i:endpos])
+            return endpos
         if end.endswith('/>'):
             # XHTML-style empty tag: <span attr="value" />
             self.handle_startendtag(tag, attrs)
@@ -353,8 +353,10 @@ class HTMLParser(markupbase.ParserBase):
                 # end of input in or before attribute value, or we have the
                 # '/' from a '/>' ending
                 return -1
-            self.updatepos(i, j)
-            self.error("malformed start tag")
+            if j > i:
+                return j
+            else:
+                return i + 1
         raise AssertionError("we should not get here!")
 
     # Internal -- parse endtag, return end or -1 if incomplete
--- a/Lib/test/test_htmlparser.py
+++ b/Lib/test/test_htmlparser.py
@@ -206,7 +206,8 @@ text
         self._run_check("</$>", [('comment', '$')])
         self._run_check("</", [('data', '</')])
         self._run_check("</a", [('data', '</a')])
-        self._parse_error("<a<a>")
+        # XXX this might be wrong
+        self._run_check("<a<a>", [('data', '<a'), ('starttag', 'a', [])])
         self._run_check("</a<a>", [('endtag', 'a<a')])
         self._run_check("<!", [('data', '<!')])
         self._run_check("<a", [('data', '<a')])
