zoukankan      html  css  js  c++  java
  • urllib2 标准代码

    import itertools
    import re
    import urllib2
    import urlparse
    def downloadHtml(url,user_agent=None,num_retries=2):
        print 'Downloading:',url
        headers={'User-agent':user_agent}
        req=urllib2.Request(url,headers=headers)
        try:
            html=urllib2.urlopen(req).read()
        except urllib2.URLError as e:
            print 'Download error:',e.reason
            html=None
            if num_retries>0:
                if hasattr(e,'code') and 500<=e.code<600:
                    return downloadHtml(url,user_agent,num_retries-1) 
        return html 
    
    def download_id(max_count=5):
        """Download pages by consecutive numeric ID until downloads keep failing.

        Page URLs are built by substituting 1, 2, 3, ... into the URL
        template. The loop stops once max_count downloads in a row have
        returned None; any successful download resets the failure counter.

        Args:
            max_count: number of consecutive failed downloads that ends the
                crawl (defaults to 5).
        """
        error_count=0
        for i in itertools.count(1):
            url='http://xxxx/%s'%i
            html=downloadHtml(url)
            if html is None:
                error_count+=1
                # >= rather than == so that a limit of 0 still terminates.
                if error_count>=max_count:
                    break
            else:
                # A success means the earlier failures were not consecutive.
                error_count=0
    
    def get_links(html):
        """Return the href values of all <a> tags found in html.

        Args:
            html: page source as a string; None or an empty string yields
                an empty list.

        Returns:
            A list with the raw href value of every anchor tag, in document
            order. Values are not resolved against any base URL.
        """
        if not html:
            return []
        # NOTE(review): the pattern here was an empty string, which matches
        # everywhere and captures nothing; this anchor-href pattern is the
        # presumed intent - confirm against the original source.
        reg=re.compile(r'<a[^>]+href=["\'](.*?)["\']',re.I|re.S)
        return reg.findall(html)
    
    def link_crawler(seed_url, link_regex):
        """Crawl from seed_url, following links whose href matches link_regex.

        Args:
            seed_url: first URL to download; also the base against which
                extracted hrefs are resolved.
            link_regex: pattern applied with re.match to each raw href; only
                matching links are queued.
        """
        crawl_queue=[seed_url]
        # URLs that have already been queued, so no page is scheduled twice.
        seen=set(crawl_queue)
        while crawl_queue:
            # pop() takes from the end of the list, so the newest link is
            # visited first.
            url=crawl_queue.pop()
            html=downloadHtml(url)
            if html is None:
                # Download failed; there is nothing to extract links from.
                continue
            for link in get_links(html):
                if re.match(link_regex,link):
                    # Turn relative hrefs into absolute URLs.
                    link=urlparse.urljoin(seed_url,link)
                    if link not in seen:
                        seen.add(link)
                        crawl_queue.append(link)

      

  • 相关阅读:
    JPA与Hibernate的关系
    EJB里的问题解答
    EJB与JPA的关系
    EJB的魅惑来源
    EJB简介
    MyEclipse如何恢复删掉的文件
    EasyUI的功能树之扁平化
    EasyUI的功能树之异步树
    Spring AOP的日志记录
    简单理解IoC与DI
  • 原文地址:https://www.cnblogs.com/howhy/p/7403882.html
Copyright © 2011-2022 走看看