"""A parser for HTML and XHTML.""" 2 3 # This file is based on sgmllib.py, but the API is slightly different. 4 5 # XXX There should be a way to distinguish between PCDATA (parsed 6 # character data -- the normal case), RCDATA (replaceable character 7 # data -- only char and entity references and end tags are special) 8 # and CDATA (character data -- only end tags are special). 9 10 11 import markupbase 12 import re 13 14 # Regular expressions used for parsing 15 16 interesting_normal = re.compile('[&<]') 17 incomplete = re.compile('&[a-zA-Z#]') 18 19 entityref = re.compile('&([a-zA-Z][-.a-zA-Z0-9]*)[^a-zA-Z0-9]') 20 charref = re.compile('&#(?:[0-9]+|[xX][0-9a-fA-F]+)[^0-9a-fA-F]') 21 22 starttagopen = re.compile('<[a-zA-Z]') 23 piclose = re.compile('>') 24 commentclose = re.compile(r'--\s*>') 25 tagfind = re.compile('([a-zA-Z][-.a-zA-Z0-9:_]*)(?:\s|/(?!>))*') 26 # see http://www.w3.org/TR/html5/tokenization.html#tag-open-state 27 # and http://www.w3.org/TR/html5/tokenization.html#tag-name-state 28 tagfind_tolerant = re.compile('[a-zA-Z][^\t\n\r\f />\x00]*') 29 30 attrfind = re.compile( 31 r'((?<=[\'"\s/])[^\s/>][^\s/=>]*)(\s*=+\s*' 32 r'(\'[^\']*\'|"[^"]*"|(?![\'"])[^>\s]*))?(?:\s|/(?!>))*') 33 34 locatestarttagend = re.compile(r""" 35 <[a-zA-Z][-.a-zA-Z0-9:_]* # tag name 36 (?:[\s/]* # optional whitespace before attribute name 37 (?:(?<=['"\s/])[^\s/>][^\s/=>]* # attribute name 38 (?:\s*=+\s* # value indicator 39 (?:'[^']*' # LITA-enclosed value 40 |"[^"]*" # LIT-enclosed value 41 |(?!['"])[^>\s]* # bare value 42 ) 43 )?(?:\s|/(?!>))* 44 )* 45 )? 46 \s* # trailing whitespace 47 """, re.VERBOSE) 48 endendtag = re.compile('>') 49 # the HTML 5 spec, section 8.1.2.2, doesn't allow spaces between 50 # </ and the tag name, so maybe this should be fixed 51 endtagfind = re.compile('</\s*([a-zA-Z][-.a-zA-Z0-9:_]*)\s*>') 52 53 54 class HTMLParseError(Exception): 55 """Exception raised for all parse errors.""" 56 57 def __init__(self, msg, position=(None, None)): 58 assert msg 59 self.msg = msg 60 self.lineno = position[0] 61 self.offset = position[1] 62 63 def __str__(self): 64 result = self.msg 65 if self.lineno is not None: 66 result = result + ", at line %d" % self.lineno 67 if self.offset is not None: 68 result = result + ", column %d" % (self.offset + 1) 69 return result 70 71 72 class HTMLParser(markupbase.ParserBase): 73 """Find tags and other markup and call handler functions. 74 75 Usage: 76 p = HTMLParser() 77 p.feed(data) 78 ... 79 p.close() 80 81 Start tags are handled by calling self.handle_starttag() or 82 self.handle_startendtag(); end tags by self.handle_endtag(). The 83 data between tags is passed from the parser to the derived class 84 by calling self.handle_data() with the data as argument (the data 85 may be split up in arbitrary chunks). Entity references are 86 passed by calling self.handle_entityref() with the entity 87 reference as the argument. Numeric character references are 88 passed to self.handle_charref() with the string containing the 89 reference as the argument. 90 """ 91 92 CDATA_CONTENT_ELEMENTS = ("script", "style") 93 94 95 def __init__(self): 96 """Initialize and reset this instance.""" 97 self.reset() 98 99 def reset(self): 100 """Reset this instance. Loses all unprocessed data.""" 101 self.rawdata = '' 102 self.lasttag = '???' 103 self.interesting = interesting_normal 104 self.cdata_elem = None 105 markupbase.ParserBase.reset(self) 106 107 def feed(self, data): 108 r"""Feed data to the parser. 
    def __init__(self):
        """Initialize and reset this instance."""
        self.reset()

    def reset(self):
        """Reset this instance.  Loses all unprocessed data."""
        self.rawdata = ''
        self.lasttag = '???'
        self.interesting = interesting_normal
        self.cdata_elem = None
        markupbase.ParserBase.reset(self)

    def feed(self, data):
        r"""Feed data to the parser.

        Call this as often as you want, with as little or as much text
        as you want (may include '\n').
        """
        self.rawdata = self.rawdata + data
        self.goahead(0)

    def close(self):
        """Handle any buffered data."""
        self.goahead(1)

    def error(self, message):
        raise HTMLParseError(message, self.getpos())

    __starttag_text = None

    def get_starttag_text(self):
        """Return full source of start tag: '<...>'."""
        return self.__starttag_text

    def set_cdata_mode(self, elem):
        self.cdata_elem = elem.lower()
        self.interesting = re.compile(r'</\s*%s\s*>' % self.cdata_elem, re.I)

    def clear_cdata_mode(self):
        self.interesting = interesting_normal
        self.cdata_elem = None

    # Internal -- handle data as far as reasonable.  May leave state
    # and data to be processed by a subsequent call.  If 'end' is
    # true, force handling all data as if followed by EOF marker.
    def goahead(self, end):
        rawdata = self.rawdata
        i = 0
        n = len(rawdata)
        while i < n:
            match = self.interesting.search(rawdata, i) # < or &
            if match:
                j = match.start()
            else:
                if self.cdata_elem:
                    break
                j = n
            if i < j: self.handle_data(rawdata[i:j])
            i = self.updatepos(i, j)
            if i == n: break
            startswith = rawdata.startswith
            if startswith('<', i):
                if starttagopen.match(rawdata, i): # < + letter
                    k = self.parse_starttag(i)
                elif startswith("</", i):
                    k = self.parse_endtag(i)
                elif startswith("<!--", i):
                    k = self.parse_comment(i)
                elif startswith("<?", i):
                    k = self.parse_pi(i)
                elif startswith("<!", i):
                    k = self.parse_html_declaration(i)
                elif (i + 1) < n:
                    self.handle_data("<")
                    k = i + 1
                else:
                    break
                if k < 0:
                    if not end:
                        break
                    k = rawdata.find('>', i + 1)
                    if k < 0:
                        k = rawdata.find('<', i + 1)
                        if k < 0:
                            k = i + 1
                    else:
                        k += 1
                    self.handle_data(rawdata[i:k])
                i = self.updatepos(i, k)
            elif startswith("&#", i):
                match = charref.match(rawdata, i)
                if match:
                    name = match.group()[2:-1]
                    self.handle_charref(name)
                    k = match.end()
                    if not startswith(';', k-1):
                        k = k - 1
                    i = self.updatepos(i, k)
                    continue
                else:
                    if ";" in rawdata[i:]:  # bail by consuming '&#'
                        self.handle_data(rawdata[i:i+2])
                        i = self.updatepos(i, i+2)
                    break
            elif startswith('&', i):
                match = entityref.match(rawdata, i)
                if match:
                    name = match.group(1)
                    self.handle_entityref(name)
                    k = match.end()
                    if not startswith(';', k-1):
                        k = k - 1
                    i = self.updatepos(i, k)
                    continue
                match = incomplete.match(rawdata, i)
                if match:
                    # match.group() will contain at least 2 chars
                    if end and match.group() == rawdata[i:]:
                        self.error("EOF in middle of entity or char ref")
                    # incomplete
                    break
                elif (i + 1) < n:
                    # not the end of the buffer, and can't be confused
                    # with some other construct
                    self.handle_data("&")
                    i = self.updatepos(i, i + 1)
                else:
                    break
            else:
                assert 0, "interesting.search() lied"
        # end while
        if end and i < n and not self.cdata_elem:
            self.handle_data(rawdata[i:n])
            i = self.updatepos(i, n)
        self.rawdata = rawdata[i:]
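
    # Illustrative note (added; the sample input is made up): feeding the
    # string 'a &amp; b &#38; c' produces the calls
    #   handle_data('a '), handle_entityref('amp'), handle_data(' b '),
    #   handle_charref('38'), handle_data(' c')
    # i.e. text is reported in pieces, split around the references.
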
    # Internal -- parse html declarations, return length or -1 if not terminated
    # See w3.org/TR/html5/tokenization.html#markup-declaration-open-state
    # See also parse_declaration in _markupbase
    def parse_html_declaration(self, i):
        rawdata = self.rawdata
        if rawdata[i:i+2] != '<!':
            self.error('unexpected call to parse_html_declaration()')
        if rawdata[i:i+4] == '<!--':
            # this case is actually already handled in goahead()
            return self.parse_comment(i)
        elif rawdata[i:i+3] == '<![':
            return self.parse_marked_section(i)
        elif rawdata[i:i+9].lower() == '<!doctype':
            # find the closing >
            gtpos = rawdata.find('>', i+9)
            if gtpos == -1:
                return -1
            self.handle_decl(rawdata[i+2:gtpos])
            return gtpos+1
        else:
            return self.parse_bogus_comment(i)

    # Internal -- parse bogus comment, return length or -1 if not terminated
    # see http://www.w3.org/TR/html5/tokenization.html#bogus-comment-state
    def parse_bogus_comment(self, i, report=1):
        rawdata = self.rawdata
        if rawdata[i:i+2] not in ('<!', '</'):
            self.error('unexpected call to parse_bogus_comment()')
        pos = rawdata.find('>', i+2)
        if pos == -1:
            return -1
        if report:
            self.handle_comment(rawdata[i+2:pos])
        return pos + 1

    # Internal -- parse processing instr, return end or -1 if not terminated
    def parse_pi(self, i):
        rawdata = self.rawdata
        assert rawdata[i:i+2] == '<?', 'unexpected call to parse_pi()'
        match = piclose.search(rawdata, i+2) # >
        if not match:
            return -1
        j = match.start()
        self.handle_pi(rawdata[i+2: j])
        j = match.end()
        return j

    # Internal -- handle starttag, return end or -1 if not terminated
    def parse_starttag(self, i):
        self.__starttag_text = None
        endpos = self.check_for_whole_start_tag(i)
        if endpos < 0:
            return endpos
        rawdata = self.rawdata
        self.__starttag_text = rawdata[i:endpos]

        # Now parse the data between i+1 and endpos into a tag and attrs
        attrs = []
        match = tagfind.match(rawdata, i+1)
        assert match, 'unexpected call to parse_starttag()'
        k = match.end()
        self.lasttag = tag = match.group(1).lower()

        while k < endpos:
            m = attrfind.match(rawdata, k)
            if not m:
                break
            attrname, rest, attrvalue = m.group(1, 2, 3)
            if not rest:
                attrvalue = None
            elif attrvalue[:1] == '\'' == attrvalue[-1:] or \
                 attrvalue[:1] == '"' == attrvalue[-1:]:
                attrvalue = attrvalue[1:-1]
            if attrvalue:
                attrvalue = self.unescape(attrvalue)
            attrs.append((attrname.lower(), attrvalue))
            k = m.end()

        end = rawdata[k:endpos].strip()
        if end not in (">", "/>"):
            lineno, offset = self.getpos()
            if "\n" in self.__starttag_text:
                lineno = lineno + self.__starttag_text.count("\n")
                offset = len(self.__starttag_text) \
                         - self.__starttag_text.rfind("\n")
            else:
                offset = offset + len(self.__starttag_text)
            self.handle_data(rawdata[i:endpos])
            return endpos
        if end.endswith('/>'):
            # XHTML-style empty tag: <span attr="value" />
            self.handle_startendtag(tag, attrs)
        else:
            self.handle_starttag(tag, attrs)
            if tag in self.CDATA_CONTENT_ELEMENTS:
                self.set_cdata_mode(tag)
        return endpos
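
    # Illustrative note (added; the sample tag is made up): a start tag such
    # as '<A HREF="x.html" disabled>' is reported as
    #   handle_starttag('a', [('href', 'x.html'), ('disabled', None)])
    # tag and attribute names are lowercased, surrounding quotes are
    # stripped, references inside attribute values are unescaped, and an
    # attribute without a value is reported with None as its value.
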
    # Internal -- check to see if we have a complete starttag; return end
    # or -1 if incomplete.
    def check_for_whole_start_tag(self, i):
        rawdata = self.rawdata
        m = locatestarttagend.match(rawdata, i)
        if m:
            j = m.end()
            next = rawdata[j:j+1]
            if next == ">":
                return j + 1
            if next == "/":
                if rawdata.startswith("/>", j):
                    return j + 2
                if rawdata.startswith("/", j):
                    # buffer boundary
                    return -1
                # else bogus input
                self.updatepos(i, j + 1)
                self.error("malformed empty start tag")
            if next == "":
                # end of input
                return -1
            if next in ("abcdefghijklmnopqrstuvwxyz=/"
                        "ABCDEFGHIJKLMNOPQRSTUVWXYZ"):
                # end of input in or before attribute value, or we have the
                # '/' from a '/>' ending
                return -1
            if j > i:
                return j
            else:
                return i + 1
        raise AssertionError("we should not get here!")

    # Internal -- parse endtag, return end or -1 if incomplete
    def parse_endtag(self, i):
        rawdata = self.rawdata
        assert rawdata[i:i+2] == "</", "unexpected call to parse_endtag"
        match = endendtag.search(rawdata, i+1) # >
        if not match:
            return -1
        gtpos = match.end()
        match = endtagfind.match(rawdata, i) # </ + tag + >
        if not match:
            if self.cdata_elem is not None:
                self.handle_data(rawdata[i:gtpos])
                return gtpos
            # find the name: w3.org/TR/html5/tokenization.html#tag-name-state
            namematch = tagfind_tolerant.match(rawdata, i+2)
            if not namematch:
                # w3.org/TR/html5/tokenization.html#end-tag-open-state
                if rawdata[i:i+3] == '</>':
                    return i+3
                else:
                    return self.parse_bogus_comment(i)
            tagname = namematch.group().lower()
            # consume and ignore other stuff between the name and the >
            # Note: this is not 100% correct, since we might have things like
            # </tag attr=">">, but looking for > after the name should cover
            # most of the cases and is much simpler
            gtpos = rawdata.find('>', namematch.end())
            self.handle_endtag(tagname)
            return gtpos+1

        elem = match.group(1).lower() # script or style
        if self.cdata_elem is not None:
            if elem != self.cdata_elem:
                self.handle_data(rawdata[i:gtpos])
                return gtpos

        self.handle_endtag(elem)
        self.clear_cdata_mode()
        return gtpos

    # Overridable -- finish processing of start+end tag: <tag.../>
    def handle_startendtag(self, tag, attrs):
        self.handle_starttag(tag, attrs)
        self.handle_endtag(tag)

    # Overridable -- handle start tag
    def handle_starttag(self, tag, attrs):
        pass

    # Overridable -- handle end tag
    def handle_endtag(self, tag):
        pass

    # Overridable -- handle character reference
    def handle_charref(self, name):
        pass

    # Overridable -- handle entity reference
    def handle_entityref(self, name):
        pass

    # Overridable -- handle data
    def handle_data(self, data):
        pass

    # Overridable -- handle comment
    def handle_comment(self, data):
        pass

    # Overridable -- handle declaration
    def handle_decl(self, decl):
        pass

    # Overridable -- handle processing instruction
    def handle_pi(self, data):
        pass

    def unknown_decl(self, data):
        pass
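
    # Illustrative note (added; the sample string is made up): the helper
    # below replaces character and entity references, e.g.
    # unescape('&lt;b&gt; &amp; &#65;') gives u'<b> & A' (the result comes
    # back as a unicode string when any reference is replaced).
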
    # Internal -- helper to remove special character quoting
    entitydefs = None
    def unescape(self, s):
        if '&' not in s:
            return s
        def replaceEntities(s):
            s = s.groups()[0]
            try:
                if s[0] == "#":
                    s = s[1:]
                    if s[0] in ['x','X']:
                        c = int(s[1:], 16)
                    else:
                        c = int(s)
                    return unichr(c)
            except ValueError:
                return '&#'+s+';'
            else:
                # Cannot use name2codepoint directly, because HTMLParser supports apos,
                # which is not part of HTML 4
                import htmlentitydefs
                if HTMLParser.entitydefs is None:
                    entitydefs = HTMLParser.entitydefs = {'apos':u"'"}
                    for k, v in htmlentitydefs.name2codepoint.iteritems():
                        entitydefs[k] = unichr(v)
                try:
                    return self.entitydefs[s]
                except KeyError:
                    return '&'+s+';'

        return re.sub(r"&(#?[xX]?(?:[0-9a-fA-F]+|\w{1,8}));", replaceEntities, s)
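

# --- Illustrative example (added; not part of the original module) ---
# A minimal sketch of how the handler API described in the HTMLParser
# docstring is typically used: subclass HTMLParser and override the
# handle_* methods you care about.  The subclass name and the sample
# markup below are invented for demonstration purposes.
if __name__ == '__main__':
    class _EventPrinter(HTMLParser):
        def handle_starttag(self, tag, attrs):
            print 'start', tag, attrs
        def handle_startendtag(self, tag, attrs):
            print 'startend', tag, attrs
        def handle_endtag(self, tag):
            print 'end', tag
        def handle_data(self, data):
            print 'data', repr(data)
        def handle_entityref(self, name):
            print 'entityref', name
        def handle_charref(self, name):
            print 'charref', name

    p = _EventPrinter()
    # feed() may be called repeatedly with arbitrary chunks of text
    p.feed('<p class="intro">Fish &amp; ch')
    p.feed('ips &#38; peas</p><br />')
    p.close()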