# -*- coding: utf-8 -*- #python 2.7 #xiaodeng #python模块之HTMLParser(原理很大程度上就是对类构造的熟练运用) import HTMLParser #tag是html标签,attrs是 (属性,值)元组(tuple)的列表(list)。 #HTMLParser自动将tag和attrs中的属性名都转为小写(属性值保持原样) ''' >>> help(HTMLParser) Help on module HTMLParser: CLASSES exceptions.Exception(exceptions.BaseException) HTMLParseError markupbase.ParserBase HTMLParser class HTMLParser(markupbase.ParserBase) | Find tags and other markup and call handler functions. | | Usage: | p = HTMLParser()#初始化 | p.feed(data)#feed()方法可以多次调用,也就是不一定一次把整个HTML字符串都塞进去,可以一部分一部分塞进去 #提供一些文本给解析器。解析器只处理其中已经构成完整元素的部分,不完整的数据会被缓冲,直到提供了更多的数据或者close()被调用 | ... | p.close() | | Methods defined here: | | __init__(self) | Initialize and reset this instance. | | check_for_whole_start_tag(self, i) | # Internal -- check to see if we have a complete starttag; return end | # or -1 if incomplete. | | clear_cdata_mode(self) | | close(self) | Handle any buffered data. | | error(self, message) | | feed(self, data) #向分析器提供数据。 | Feed data to the parser. | | Call this as often as you want, with as little or as much text | as you want (may include '\n'). | | get_starttag_text(self) | Return full source of start tag: '<...>'. | | goahead(self, end) | # Internal -- handle data as far as reasonable. May leave state | # and data to be processed by a subsequent call. If 'end' is | # true, force handling all data as if followed by EOF marker. 
| | handle_charref(self, name) #处理字符引用,就是以&#开头的(比如&#62;),一般是内码表示的字符 | # Overridable -- handle character reference | | handle_comment(self, data) #处理注释,处理<!--comment-->内的内容 | # Overridable -- handle comment | | handle_data(self, data) #处理数据,就是<xx>data</xx>中间的那些数据 | # Overridable -- handle data | | handle_decl(self, decl) #处理<!开头的,比如<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN"> | #文档类型声明 # Overridable -- handle declaration | | handle_endtag(self, tag) #处理结束标签,</xx> | # Overridable -- handle end tag | | handle_entityref(self, name) #处理实体引用,就是以&开头的特殊字符(比如&gt;) | # Overridable -- handle entity reference | | handle_pi(self, data) #处理形如<?instruction>的东西 | # Overridable -- handle processing instruction | | handle_startendtag(self, tag, attrs) #处理自闭合标签(开始标签和结束标签合一),比如<xx/> | # Overridable -- finish processing of start+end tag: <tag.../> | | handle_starttag(self, tag, attrs) # 处理开始标签,比如<xx> | # Overridable -- handle start tag | | parse_bogus_comment(self, i, report=1) | # Internal -- parse bogus comment, return length or -1 if not terminated | # see http://www.w3.org/TR/html5/tokenization.html#bogus-comment-state | | parse_endtag(self, i) | # Internal -- parse endtag, return end or -1 if incomplete | | parse_html_declaration(self, i) | # Internal -- parse html declarations, return length or -1 if not terminated | # See w3.org/TR/html5/tokenization.html#markup-declaration-open-state | # See also parse_declaration in _markupbase | | parse_pi(self, i) | # Internal -- parse processing instr, return end or -1 if not terminated | | parse_starttag(self, i) | # Internal -- handle starttag, return end or -1 if not terminated | | reset(self) | Reset this instance. Loses all unprocessed data. 
| | set_cdata_mode(self, elem) | | unescape(self, s) | | unknown_decl(self, data) | | ---------------------------------------------------------------------- | Data and other attributes defined here: | | CDATA_CONTENT_ELEMENTS = ('script', 'style') | | entitydefs = None | | ---------------------------------------------------------------------- | Methods inherited from markupbase.ParserBase: | | getpos(self) | Return current line number and offset. | | parse_comment(self, i, report=1) | # Internal -- parse comment, return length or -1 if not terminated | | parse_declaration(self, i) | # Internal -- parse declaration (for use by subclasses). | | parse_marked_section(self, i, report=1) | # Internal -- parse a marked section | # Override this to handle MS-word extension syntax <![if word]>content<![endif]> | | updatepos(self, i, j) | # Internal -- update line number and offset. This should be | # called for each piece of data exactly once, in order -- in other | # words the concatenation of all the input strings to this | # function should be exactly the entire input. >>> '''