zoukankan      html  css  js  c++  java
  • python模块之HTMLParser(原理很大程度上就是对类构造的熟练运用)

    # -*- coding: utf-8 -*-
    #python 2.7
    #xiaodeng
    #python模块之HTMLParser(原理很大程度上就是对类构造的熟练运用)
    
    
    import HTMLParser
    #tag是html标签名,attrs是由(属性名,属性值)元组(tuple)组成的列表(list)。
    #HTMLParser自动将tag和attrs中的属性名转为小写(属性值保持原样)
    
    
    '''
    >>> help(HTMLParser)
    Help on module HTMLParser:
    CLASSES
        exceptions.Exception(exceptions.BaseException)
            HTMLParseError
        markupbase.ParserBase
            HTMLParser
        
        class HTMLParser(markupbase.ParserBase)
         |  Find tags and other markup and call handler functions.
         |  
         |  Usage:
         |      p = HTMLParser()#初始化
         |      p.feed(data)#feed()方法可以多次调用,也就是不一定一次把整个HTML字符串都塞进去,可以一部分一部分塞进去
                            #提供一些文本给解析器。在由完整元素组成的限度内进行处理,不完整的数据被缓冲直到更多的数据提供或者close()被调用
         |      ...
         |      p.close()
         |  
         |  Methods defined here:
         |  
         |  __init__(self)
         |      Initialize and reset this instance.
         |  
         |  check_for_whole_start_tag(self, i)
         |      # Internal -- check to see if we have a complete starttag; return end
         |      # or -1 if incomplete.
         |  
         |  clear_cdata_mode(self)
         |  
         |  close(self)
         |      Handle any buffered data.
         |  
         |  error(self, message)
         |  
         |  feed(self, data)            #向分析器提供数据。
         |      Feed data to the parser.
         |      
         |      Call this as often as you want, with as little or as much text
         |      as you want (may include '\n').
         |  
         |  get_starttag_text(self)
         |      Return full source of start tag: '<...>'.
         |  
         |  goahead(self, end)
         |      # Internal -- handle data as far as reasonable.  May leave state
         |      # and data to be processed by a subsequent call.  If 'end' is
         |      # true, force handling all data as if followed by EOF marker.
         |  
         |  handle_charref(self, name)              #处理特殊字符串,就是以&#开头的,一般是内码表示的字符
         |      # Overridable -- handle character reference
         |  
         |  handle_comment(self, data)              #处理注释,处理<!--comment-->内的内容
         |      # Overridable -- handle comment
         |  
         |  handle_data(self, data)                 #处理数据,就是<xx>data</xx>中间的那些数据
         |      # Overridable -- handle data
         |  
         |  handle_decl(self, decl)                 #处理<!开头的声明,比如<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN">
         |                                          #即文档类型声明
         |      # Overridable -- handle declaration
         |  
         |  handle_endtag(self, tag)                #处理结束标签,</xx>
         |      # Overridable -- handle end tag
         |  
         |  handle_entityref(self, name)            #处理一些特殊字符,以&开头的
         |      # Overridable -- handle entity reference
         |  
         |  handle_pi(self, data)                   #处理形如<?instruction>的东西
         |      # Overridable -- handle processing instruction
         |  
         |  handle_startendtag(self, tag, attrs)    #处理自闭合标签(开始标签同时也是结束标签),比如<br/>
         |      # Overridable -- finish processing of start+end tag: <tag.../>
         |  
         |  handle_starttag(self, tag, attrs)       # 处理开始标签,比如<xx>
         |      # Overridable -- handle start tag
         |  
         |  parse_bogus_comment(self, i, report=1)
         |      # Internal -- parse bogus comment, return length or -1 if not terminated
         |      # see http://www.w3.org/TR/html5/tokenization.html#bogus-comment-state
         |  
         |  parse_endtag(self, i)                   
         |      # Internal -- parse endtag, return end or -1 if incomplete
         |  
         |  parse_html_declaration(self, i)
         |      # Internal -- parse html declarations, return length or -1 if not terminated
         |      # See w3.org/TR/html5/tokenization.html#markup-declaration-open-state
         |      # See also parse_declaration in _markupbase
         |  
         |  parse_pi(self, i)
         |      # Internal -- parse processing instr, return end or -1 if not terminated
         |  
         |  parse_starttag(self, i)
         |      # Internal -- handle starttag, return end or -1 if not terminated
         |  
         |  reset(self)
         |      Reset this instance.  Loses all unprocessed data.
         |  
         |  set_cdata_mode(self, elem)
         |  
         |  unescape(self, s)
         |  
         |  unknown_decl(self, data)
         |  
         |  ----------------------------------------------------------------------
         |  Data and other attributes defined here:
         |  
         |  CDATA_CONTENT_ELEMENTS = ('script', 'style')
         |  
         |  entitydefs = None
         |  
         |  ----------------------------------------------------------------------
         |  Methods inherited from markupbase.ParserBase:
         |  
         |  getpos(self)
         |      Return current line number and offset.
         |  
         |  parse_comment(self, i, report=1)
         |      # Internal -- parse comment, return length or -1 if not terminated
         |  
         |  parse_declaration(self, i)
         |      # Internal -- parse declaration (for use by subclasses).
         |  
         |  parse_marked_section(self, i, report=1)
         |      # Internal -- parse a marked section
         |      # Override this to handle MS-word extension syntax <![if word]>content<![endif]>
         |  
         |  updatepos(self, i, j)
         |      # Internal -- update line number and offset.  This should be
         |      # called for each piece of data exactly once, in order -- in other
         |      # words the concatenation of all the input strings to this
         |      # function should be exactly the entire input.
    
    >>> 
    '''
  • 相关阅读:
    数据库——大事务
    数据库——性能理解
    java——通过GenericObjectPool获取到的资源,调用close()方法会close还是returnObject?
    gradle——入门
    MongoDB——morphia
    jvm——Java main方法的执行
    sql——limit
    阿里云Open API自动化脚本—ECS公网IP转化弹性公网IP
    阿里云共享带宽
    mysql5.x安装脚本
  • 原文地址:https://www.cnblogs.com/dengyg200891/p/4983857.html
Copyright © 2011-2022 走看看