zoukankan      html  css  js  c++  java
  • python模块之HTMLParser抓页面上的所有URL链接

    # -*- coding: utf-8 -*-
    #python 2.7
    #xiaodeng
    #python模块之HTMLParser抓页面上的所有URL链接
    
    
    import urllib
    #MyParser类写法一
    '''
    from HTMLParser import HTMLParser
    class MyParser(HTMLParser):
        def __init__(self):
            HTMLParser.__init__(self)
           
        def handle_starttag(self, tag, attrs):
            if tag == 'a':
                for name,value in attrs:
                    if name == 'href' and value.startswith('http'):
                        print value
    
    '''
    
    # MyParser, variant 2
    # Keep the module-level name ``HTMLParser`` bound to the parser module on
    # both Python 2 (HTMLParser) and Python 3 (html.parser).
    try:
        import HTMLParser  # Python 2
    except ImportError:
        import html.parser as HTMLParser  # Python 3


    class MyParser(HTMLParser.HTMLParser):
        """HTML parser that prints every absolute link found in <a> tags."""

        def handle_starttag(self, tag, attrs):
            """Print the href of an <a> start tag when it starts with 'http'.

            Overrides the base-class hook that is called for each start tag.

            Args:
                tag: Tag name as passed in by the parser.
                attrs: List of (name, value) pairs for the tag's attributes.
            """
            if tag != 'a':  # only anchor tags carry the links we want
                return
            for name, value in attrs:
                # A bare attribute such as <a href> is reported with value
                # None; skip it instead of crashing on None.startswith().
                if name == 'href' and value is not None and value.startswith('http'):
                    print(value)


    if __name__ == '__main__':
        import contextlib

        try:
            from urllib.request import urlopen  # Python 3
        except ImportError:
            from urllib import urlopen  # Python 2

        url = 'http://www.cnblogs.com/'
        # closing() guarantees the connection is released on both Python
        # versions (the Python 2 response object is not a context manager).
        with contextlib.closing(urlopen(url)) as response:
            content = response.read()
        if not isinstance(content, str):
            # Python 3 returns bytes but feed() needs text.
            # NOTE(review): assumes the page is UTF-8 encoded -- confirm.
            content = content.decode('utf-8', 'replace')
        my = MyParser()
        my.feed(content)
        my.close()

    # Sample output:
    # http://www.cnblogs.com/Jaryleely/p/careertwo.html
    # http://www.cnblogs.com/Jaryleely/
    # http://www.cnblogs.com/Jaryleely/
    # http://www.cnblogs.com/Jaryleely/p/careertwo.html#commentform
    # http://www.cnblogs.com/Jaryleely/p/careertwo.html
    # http://www.cnblogs.com/AndroidJotting/p/4983688.html
    # http://www.cnblogs.com/AndroidJotting/
    # http://www.cnblogs.com/AndroidJotting/
    # http://www.cnblogs.com/AndroidJotting/p/4983688.html#commentform
    # http://www.cnblogs.com/AndroidJotting/p/4983688.html
    # http://www.cnblogs.com/fuly550871915/p/4983682.html
    # http://www.cnblogs.com/fuly550871915/
    # http://www.cnblogs.com/fuly550871915/
    # http://www.cnblogs.com/fuly550871915/p/4983682.html#commentform
    # http://www.cnblogs.com/fuly550871915/p/4983682.html
    # http://www.cnblogs.com/Ray-liang/p/4983592.html
    # http://www.cnblogs.com/Ray-liang/
    # http://www.cnblogs.com/Ray-liang/
    # http://www.cnblogs.com/Ray-liang/p/4983592.html#commentform
    # http://www.cnblogs.com/Ray-liang/p/4983592.html
    # .......
  • 相关阅读:
    CF1174D Ehab and the Expected XOR Problem
    CF1083B The Fair Nut and Strings
    CF1088D Ehab and another another xor problem
    CF1168A Increasing by Modulo
    CF1166C A Tale of Two Lands
    CF1142A The Beatles
    CF1105D Kilani and the Game
    【uva11248】网络扩容
    【sam复习】用sam实现后缀排序
    【Educational Codeforces Round 19】
  • 原文地址:https://www.cnblogs.com/dengyg200891/p/4983746.html
Copyright © 2011-2022 走看看