zoukankan      html  css  js  c++  java
  • 提取网址的python练习

    # Python 2 only: urllib2 and cookielib were merged/renamed in Python 3
    # (urllib.request, http.cookiejar).
    import urllib, urllib2, cookielib
    from HTMLParser import HTMLParser
    import sys
    
    # HACK: force the process-wide default encoding to UTF-8 so that
    # implicit str/unicode conversions of Chinese page content don't raise
    # UnicodeDecodeError. reload() is needed because site.py deletes
    # sys.setdefaultencoding at startup. This is a widely discouraged
    # Python 2 workaround -- proper fix is explicit .encode()/.decode().
    reload(sys)
    sys.setdefaultencoding('utf8')
    
    class WebParser(HTMLParser):
        """HTML parser that collects hyperlink targets into a shared set.

        For every ``<a href="...">`` tag seen while feeding HTML:
        absolute URLs (starting with "http") are added as-is, and
        site-relative URLs (starting with "/") are resolved by prefixing
        the base ``path``. All other href values are ignored.
        """

        def __init__(self, links, path):
            HTMLParser.__init__(self)
            self.links = links  # shared set; the caller reads results from it
            self.path = path    # base URL used to resolve relative hrefs

        def handle_starttag(self, tag, attrs):
            # Only anchor tags carry the links we care about.
            if tag != 'a':
                return
            # An empty attrs list simply yields zero iterations, so no
            # special-casing is needed.
            for key, val in attrs:
                if key == 'href':
                    if val.startswith('http'):
                        self.links.add(val)
                    elif val.startswith('/'):
                        self.links.add(self.path + val)
    
    class Crawl:
        def __init__(self):
            self.path = 'http://www.baidu.com'
            self.cookie = cookielib.CookieJar()
            handler = urllib2.HTTPCookieProcessor(self.cookie)
            self.opener = urllib2.build_opener(handler)
    
        def open(self, path):
            self.response = self.opener.open(path)
    
        def showCookie(self):
            for item in self.cookie:
                print 'Name = ' + item.name
                print 'value = ' + item.value
    
        def showResponse(self):
            print self.response.read()
    
        def getAllUrl(self, links, path):
            try:
                self.open(path)
                res = self.response.read()
                parser = WebParser(links, path)
                parser.feed(res)
                parser.close()
            except Exception, e:
                print e
    
        def crawl(self):
            src_links = set()
            result_links = set()
            self.getAllUrl(src_links, self.path)
            n = 200
            while len(src_links) != 0 and n > 0:
                link = src_links.pop()
                if link in result_links:
                    pass
                result_links.add(link)
                self.getAllUrl(src_links, link)
                n -= 1
                print n
    
            return result_links | src_links
    
    c = Crawl()
    rlt = c.crawl()
    for link in rlt:
        print link
  • 相关阅读:
    基础数据类型
    python2x与python3x区别(30个)更新中。。。
    注释
    常量
    变量
    十、SpringCloud config分布式配置中心
    九、Gateway新一代网关
    八、Hystrix断路器(下)
    八、Hystrix断路器(上)
    七、OpenFeign服务接口调用
  • 原文地址:https://www.cnblogs.com/hushpa/p/4671144.html
Copyright © 2011-2022 走看看