zoukankan      html  css  js  c++  java
  • Lib/HTMLParser.py source code

    """A parser for HTML and XHTML."""
         2 
         3 # This file is based on sgmllib.py, but the API is slightly different.
         4 
         5 # XXX There should be a way to distinguish between PCDATA (parsed
         6 # character data -- the normal case), RCDATA (replaceable character
         7 # data -- only char and entity references and end tags are special)
         8 # and CDATA (character data -- only end tags are special).
         9 
        10 
        11 import markupbase
        12 import re
        13 
        # Regular expressions used for parsing.
        #
        # Every pattern containing a regex escape such as \s is written as a raw
        # string so that the backslash reaches the re module untouched instead of
        # relying on Python passing an unknown string escape through.

        # Next character that may start markup or a reference: '&' or '<'.
        interesting_normal = re.compile('[&<]')
        # Start of something that could still become an entity or char reference.
        incomplete = re.compile('&[a-zA-Z#]')

        # Named entity reference plus the single character that terminates it.
        entityref = re.compile('&([a-zA-Z][-.a-zA-Z0-9]*)[^a-zA-Z0-9]')
        # Decimal or hexadecimal character reference plus its terminating character.
        charref = re.compile('&#(?:[0-9]+|[xX][0-9a-fA-F]+)[^0-9a-fA-F]')

        # '<' immediately followed by a letter: the opening of a start tag.
        starttagopen = re.compile('<[a-zA-Z]')
        # End of a processing instruction.
        piclose = re.compile('>')
        # End of a comment, tolerating whitespace between '--' and '>'.
        commentclose = re.compile(r'--\s*>')
        # Tag name (group 1) followed by any whitespace or '/' not starting '/>'.
        tagfind = re.compile(r'([a-zA-Z][-.a-zA-Z0-9:_]*)(?:\s|/(?!>))*')
        # see http://www.w3.org/TR/html5/tokenization.html#tag-open-state
        # and http://www.w3.org/TR/html5/tokenization.html#tag-name-state
        tagfind_tolerant = re.compile('[a-zA-Z][^\t\n\r\f />\x00]*')

        # One attribute: name (group 1), optional "= value" part (group 2) and the
        # value itself, still quoted if it was quoted in the input (group 3).
        attrfind = re.compile(
            r'((?<=[\'"\s/])[^\s/>][^\s/=>]*)(\s*=+\s*'
            r'(\'[^\']*\'|"[^"]*"|(?![\'"])[^>\s]*))?(?:\s|/(?!>))*')

        # Everything in a start tag up to, but not including, the closing '>' or
        # '/>'.
        locatestarttagend = re.compile(r"""
          <[a-zA-Z][-.a-zA-Z0-9:_]*          # tag name
          (?:[\s/]*                          # optional whitespace before attribute name
            (?:(?<=['"\s/])[^\s/>][^\s/=>]*  # attribute name
              (?:\s*=+\s*                    # value indicator
                (?:'[^']*'                   # LITA-enclosed value
                  |"[^"]*"                   # LIT-enclosed value
                  |(?!['"])[^>\s]*           # bare value
                 )
               )?(?:\s|/(?!>))*
             )*
           )?
          \s*                                # trailing whitespace
        """, re.VERBOSE)
        # Closing bracket of an end tag.
        endendtag = re.compile('>')
        # the HTML 5 spec, section 8.1.2.2, doesn't allow spaces between
        # </ and the tag name, so maybe this should be fixed
        endtagfind = re.compile(r'</\s*([a-zA-Z][-.a-zA-Z0-9:_]*)\s*>')
        52 
        53 
        class HTMLParseError(Exception):
            """Exception raised for all parse errors.

            Attributes:
                msg    -- the error message given to the constructor
                lineno -- first item of the *position* pair, or None
                offset -- second item of the *position* pair, or None
            """

            def __init__(self, msg, position=(None, None)):
                """Store the message and the (lineno, offset) pair it refers to."""
                assert msg
                # Initialise the base class so that ``args`` carries the message.
                # With empty ``args`` repr() shows no message and copy/pickle cannot
                # rebuild the exception, because ``msg`` is a required argument.
                Exception.__init__(self, msg)
                self.msg = msg
                self.lineno = position[0]
                self.offset = position[1]

            def __str__(self):
                """Return the message, followed by the position when it is known."""
                result = self.msg
                if self.lineno is not None:
                    result = result + ", at line %d" % self.lineno
                if self.offset is not None:
                    # the stored offset is reported one higher, as a column number
                    result = result + ", column %d" % (self.offset + 1)
                return result
        70 
        71 
        class HTMLParser(markupbase.ParserBase):
            """Find tags and other markup and call handler functions.

            Usage:
                p = HTMLParser()
                p.feed(data)
                ...
                p.close()

            Start tags are handled by calling self.handle_starttag() or
            self.handle_startendtag(); end tags by self.handle_endtag().  The
            data between tags is passed from the parser to the derived class
            by calling self.handle_data() with the data as argument (the data
            may be split up in arbitrary chunks).  Entity references are
            passed by calling self.handle_entityref() with the entity
            reference as the argument.  Numeric character references are
            passed to self.handle_charref() with the string containing the
            reference as the argument.
            """

            # Elements whose content is scanned only for their own end tag.
            CDATA_CONTENT_ELEMENTS = ("script", "style")

            def __init__(self):
                """Initialize and reset this instance."""
                self.reset()

            def reset(self):
                """Reset this instance.  Loses all unprocessed data."""
                self.rawdata = ''
                self.lasttag = '???'
                self.interesting = interesting_normal
                self.cdata_elem = None
                markupbase.ParserBase.reset(self)

            def feed(self, data):
                r"""Feed data to the parser.

                Call this as often as you want, with as little or as much text
                as you want (may include '\n').
                """
                self.rawdata = self.rawdata + data
                self.goahead(0)

            def close(self):
                """Handle any buffered data."""
                self.goahead(1)

            def error(self, message):
                """Raise HTMLParseError for *message* at the current position."""
                raise HTMLParseError(message, self.getpos())

            __starttag_text = None

            def get_starttag_text(self):
                """Return full source of start tag: '<...>'."""
                return self.__starttag_text

            def set_cdata_mode(self, elem):
                """Enter CDATA mode: only the end tag of *elem* is interesting."""
                self.cdata_elem = elem.lower()
                self.interesting = re.compile(r'</\s*%s\s*>' % self.cdata_elem, re.I)

            def clear_cdata_mode(self):
                """Leave CDATA mode and look for '&' and '<' again."""
                self.interesting = interesting_normal
                self.cdata_elem = None

            # Internal -- handle data as far as reasonable.  May leave state
            # and data to be processed by a subsequent call.  If 'end' is
            # true, force handling all data as if followed by EOF marker.
            def goahead(self, end):
                """Process as much of self.rawdata as possible.

                Dispatches to the parse_* methods and handle_* callbacks and keeps
                whatever could not be processed yet in self.rawdata.  When *end* is
                true, incomplete constructs are flushed as data instead of being
                kept for a later call.
                """
                rawdata = self.rawdata
                i = 0
                n = len(rawdata)
                while i < n:
                    match = self.interesting.search(rawdata, i) # < or &
                    if match:
                        j = match.start()
                    else:
                        if self.cdata_elem:
                            # in CDATA mode nothing is emitted until the end tag
                            # of the element shows up
                            break
                        j = n
                    if i < j:
                        self.handle_data(rawdata[i:j])
                    i = self.updatepos(i, j)
                    if i == n:
                        break
                    startswith = rawdata.startswith
                    if startswith('<', i):
                        if starttagopen.match(rawdata, i): # < + letter
                            k = self.parse_starttag(i)
                        elif startswith("</", i):
                            k = self.parse_endtag(i)
                        elif startswith("<!--", i):
                            k = self.parse_comment(i)
                        elif startswith("<?", i):
                            k = self.parse_pi(i)
                        elif startswith("<!", i):
                            k = self.parse_html_declaration(i)
                        elif (i + 1) < n:
                            self.handle_data("<")
                            k = i + 1
                        else:
                            break
                        if k < 0:
                            # the construct is not terminated in the buffer
                            if not end:
                                break
                            # at EOF: emit it as data, up to and including the
                            # next '>', else up to the next '<', else just the
                            # '<' itself
                            k = rawdata.find('>', i + 1)
                            if k < 0:
                                k = rawdata.find('<', i + 1)
                                if k < 0:
                                    k = i + 1
                            else:
                                k += 1
                            self.handle_data(rawdata[i:k])
                        i = self.updatepos(i, k)
                    elif startswith("&#", i):
                        match = charref.match(rawdata, i)
                        if match:
                            name = match.group()[2:-1]
                            self.handle_charref(name)
                            k = match.end()
                            # the terminating character is consumed only if it
                            # is the ';'
                            if not startswith(';', k-1):
                                k = k - 1
                            i = self.updatepos(i, k)
                            continue
                        else:
                            if ";" in rawdata[i:]: #bail by consuming &#
                                # Emit the '&#' found at the current position.
                                # Slicing from 0 and advancing to index 2 would
                                # emit the wrong text and move the cursor to the
                                # wrong place whenever i is not 0.
                                self.handle_data(rawdata[i:i+2])
                                i = self.updatepos(i, i+2)
                            break
                    elif startswith('&', i):
                        match = entityref.match(rawdata, i)
                        if match:
                            name = match.group(1)
                            self.handle_entityref(name)
                            k = match.end()
                            if not startswith(';', k-1):
                                k = k - 1
                            i = self.updatepos(i, k)
                            continue
                        match = incomplete.match(rawdata, i)
                        if match:
                            # match.group() will contain at least 2 chars
                            if end and match.group() == rawdata[i:]:
                                self.error("EOF in middle of entity or char ref")
                            # incomplete
                            break
                        elif (i + 1) < n:
                            # not the end of the buffer, and can't be confused
                            # with some other construct
                            self.handle_data("&")
                            i = self.updatepos(i, i + 1)
                        else:
                            break
                    else:
                        assert 0, "interesting.search() lied"
                # end while
                if end and i < n and not self.cdata_elem:
                    self.handle_data(rawdata[i:n])
                    i = self.updatepos(i, n)
                self.rawdata = rawdata[i:]

            # Internal -- parse html declarations, return length or -1 if not terminated
            # See w3.org/TR/html5/tokenization.html#markup-declaration-open-state
            # See also parse_declaration in _markupbase
            def parse_html_declaration(self, i):
                """Parse the '<!...' construct at index *i*.

                Returns the index just past the construct, or -1 when a doctype or
                bogus comment is not terminated in the buffer.
                """
                rawdata = self.rawdata
                if rawdata[i:i+2] != '<!':
                    self.error('unexpected call to parse_html_declaration()')
                if rawdata[i:i+4] == '<!--':
                    # this case is actually already handled in goahead()
                    return self.parse_comment(i)
                elif rawdata[i:i+3] == '<![':
                    return self.parse_marked_section(i)
                elif rawdata[i:i+9].lower() == '<!doctype':
                    # find the closing >
                    gtpos = rawdata.find('>', i+9)
                    if gtpos == -1:
                        return -1
                    self.handle_decl(rawdata[i+2:gtpos])
                    return gtpos+1
                else:
                    return self.parse_bogus_comment(i)

            # Internal -- parse bogus comment, return length or -1 if not terminated
            # see http://www.w3.org/TR/html5/tokenization.html#bogus-comment-state
            def parse_bogus_comment(self, i, report=1):
                """Parse a '<!...>' or '</...>' construct at *i* as a comment.

                The text between the two-character opener and the next '>' is
                passed to handle_comment() unless *report* is false.  Returns the
                index just past the '>', or -1 if there is no '>' yet.
                """
                rawdata = self.rawdata
                if rawdata[i:i+2] not in ('<!', '</'):
                    self.error('unexpected call to parse_bogus_comment()')
                pos = rawdata.find('>', i+2)
                if pos == -1:
                    return -1
                if report:
                    self.handle_comment(rawdata[i+2:pos])
                return pos + 1

            # Internal -- parse processing instr, return end or -1 if not terminated
            def parse_pi(self, i):
                """Parse the '<?...>' at *i* and pass its content to handle_pi()."""
                rawdata = self.rawdata
                assert rawdata[i:i+2] == '<?', 'unexpected call to parse_pi()'
                match = piclose.search(rawdata, i+2) # >
                if not match:
                    return -1
                j = match.start()
                self.handle_pi(rawdata[i+2: j])
                j = match.end()
                return j

            # Internal -- handle starttag, return end or -1 if not terminated
            def parse_starttag(self, i):
                """Parse the start tag at index *i* and dispatch it.

                Calls handle_startendtag() for a tag ending in '/>', otherwise
                handle_starttag(); attribute names and the tag name are lower-cased
                and attribute values are unquoted and unescaped.  Returns the index
                just past the tag, or a negative value if the tag is incomplete.
                """
                self.__starttag_text = None
                endpos = self.check_for_whole_start_tag(i)
                if endpos < 0:
                    return endpos
                rawdata = self.rawdata
                self.__starttag_text = rawdata[i:endpos]

                # Now parse the data between i+1 and j into a tag and attrs
                attrs = []
                match = tagfind.match(rawdata, i+1)
                assert match, 'unexpected call to parse_starttag()'
                k = match.end()
                self.lasttag = tag = match.group(1).lower()

                while k < endpos:
                    m = attrfind.match(rawdata, k)
                    if not m:
                        break
                    attrname, rest, attrvalue = m.group(1, 2, 3)
                    if not rest:
                        # attribute given without "= value"
                        attrvalue = None
                    elif attrvalue[:1] == '\'' == attrvalue[-1:] or \
                         attrvalue[:1] == '"' == attrvalue[-1:]:
                        attrvalue = attrvalue[1:-1]
                    if attrvalue:
                        attrvalue = self.unescape(attrvalue)
                    attrs.append((attrname.lower(), attrvalue))
                    k = m.end()

                end = rawdata[k:endpos].strip()
                if end not in (">", "/>"):
                    # something other than the closing bracket is left over:
                    # pass the whole tag on as plain data
                    self.handle_data(rawdata[i:endpos])
                    return endpos
                if end.endswith('/>'):
                    # XHTML-style empty tag: <span attr="value" />
                    self.handle_startendtag(tag, attrs)
                else:
                    self.handle_starttag(tag, attrs)
                    if tag in self.CDATA_CONTENT_ELEMENTS:
                        self.set_cdata_mode(tag)
                return endpos

            # Internal -- check to see if we have a complete starttag; return end
            # or -1 if incomplete.
            def check_for_whole_start_tag(self, i):
                """Return the index just past the start tag at *i*, or -1.

                -1 means the buffer ends before the tag is complete.
                """
                rawdata = self.rawdata
                m = locatestarttagend.match(rawdata, i)
                if m:
                    j = m.end()
                    nextchar = rawdata[j:j+1]
                    if nextchar == ">":
                        return j + 1
                    if nextchar == "/":
                        if rawdata.startswith("/>", j):
                            return j + 2
                        # a '/' that is not (yet) followed by '>': buffer boundary
                        return -1
                    if nextchar == "":
                        # end of input
                        return -1
                    if nextchar in ("abcdefghijklmnopqrstuvwxyz=/"
                                    "ABCDEFGHIJKLMNOPQRSTUVWXYZ"):
                        # end of input in or before attribute value, or we have the
                        # '/' from a '/>' ending
                        return -1
                    if j > i:
                        return j
                    else:
                        return i + 1
                raise AssertionError("we should not get here!")

            # Internal -- parse endtag, return end or -1 if incomplete
            def parse_endtag(self, i):
                """Parse the end tag at index *i* and call handle_endtag().

                In CDATA mode, anything but the end tag of the current CDATA
                element is passed to handle_data() instead.  Returns the index just
                past the tag, or -1 if no '>' has been seen yet.
                """
                rawdata = self.rawdata
                assert rawdata[i:i+2] == "</", "unexpected call to parse_endtag"
                match = endendtag.search(rawdata, i+1) # >
                if not match:
                    return -1
                gtpos = match.end()
                match = endtagfind.match(rawdata, i) # </ + tag + >
                if not match:
                    if self.cdata_elem is not None:
                        self.handle_data(rawdata[i:gtpos])
                        return gtpos
                    # find the name: w3.org/TR/html5/tokenization.html#tag-name-state
                    namematch = tagfind_tolerant.match(rawdata, i+2)
                    if not namematch:
                        # w3.org/TR/html5/tokenization.html#end-tag-open-state
                        if rawdata[i:i+3] == '</>':
                            return i+3
                        else:
                            return self.parse_bogus_comment(i)
                    tagname = namematch.group().lower()
                    # consume and ignore other stuff between the name and the >
                    # Note: this is not 100% correct, since we might have things like
                    # </tag attr=">">, but looking for > after the name should cover
                    # most of the cases and is much simpler
                    gtpos = rawdata.find('>', namematch.end())
                    self.handle_endtag(tagname)
                    return gtpos+1

                elem = match.group(1).lower() # script or style
                if self.cdata_elem is not None:
                    if elem != self.cdata_elem:
                        self.handle_data(rawdata[i:gtpos])
                        return gtpos

                self.handle_endtag(elem)
                self.clear_cdata_mode()
                return gtpos

            # Overridable -- finish processing of start+end tag: <tag.../>
            def handle_startendtag(self, tag, attrs):
                """Handle '<tag .../>' as a start tag directly followed by an end tag."""
                self.handle_starttag(tag, attrs)
                self.handle_endtag(tag)

            # Overridable -- handle start tag
            def handle_starttag(self, tag, attrs):
                """Called with the lower-cased tag name and a list of (name, value) pairs."""
                pass

            # Overridable -- handle end tag
            def handle_endtag(self, tag):
                """Called with the lower-cased name of an end tag."""
                pass

            # Overridable -- handle character reference
            def handle_charref(self, name):
                """Called with the text between '&#' and the terminating character."""
                pass

            # Overridable -- handle entity reference
            def handle_entityref(self, name):
                """Called with the name of an entity reference, without '&' and ';'."""
                pass

            # Overridable -- handle data
            def handle_data(self, data):
                """Called with a chunk of text that is not markup."""
                pass

            # Overridable -- handle comment
            def handle_comment(self, data):
                """Called with the content of a comment."""
                pass

            # Overridable -- handle declaration
            def handle_decl(self, decl):
                """Called with the text between '<!' and '>' of a doctype declaration."""
                pass

            # Overridable -- handle processing instruction
            def handle_pi(self, data):
                """Called with the text between '<?' and the closing '>'."""
                pass

            def unknown_decl(self, data):
                """Hook that does nothing by default.

                NOTE(review): not called from this class; presumably invoked by the
                markupbase base class -- confirm there.
                """
                pass

            # Internal -- helper to remove special character quoting
            entitydefs = None
            def unescape(self, s):
                """Return *s* with character and entity references replaced.

                References that cannot be converted (unknown entity name, invalid
                or out-of-range number) are left in the text unchanged.
                """
                if '&' not in s:
                    return s
                def replaceEntities(match):
                    ref = match.group(1)
                    if ref[0] == "#":
                        # numeric reference: decimal, or hexadecimal after x/X
                        ref = ref[1:]
                        try:
                            if ref[0] in ('x', 'X'):
                                c = int(ref[1:], 16)
                            else:
                                c = int(ref)
                            return unichr(c)
                        except ValueError:
                            return '&#' + ref + ';'
                    # Cannot use name2codepoint directly, because HTMLParser supports apos,
                    # which is not part of HTML 4
                    import htmlentitydefs
                    if HTMLParser.entitydefs is None:
                        # build the shared name -> character table on first use
                        entitydefs = HTMLParser.entitydefs = {'apos':u"'"}
                        for k, v in htmlentitydefs.name2codepoint.iteritems():
                            entitydefs[k] = unichr(v)
                    try:
                        return self.entitydefs[ref]
                    except KeyError:
                        return '&' + ref + ';'

                return re.sub(r"&(#?[xX]?(?:[0-9a-fA-F]+|\w{1,8}));", replaceEntities, s)
  • 相关阅读:
    Python爬虫基础之BeautifulSoup
    Python爬虫基础之requests
    Python爬虫基础之UrlError
    Python爬虫基础之Cookie
    module 'socket' has no attribute的解决方案
    ECS——CentOS7下使用yum安装MariaDB
    用 Python 定位特定类型文件
    django——文本编辑器
    解决文件路径的问题的总结
    Django——发送邮件
  • 原文地址:https://www.cnblogs.com/hzhida/p/2635473.html
Copyright © 2011-2022 走看看