zoukankan      html  css  js  c++  java
  • 1.4.3 ID遍历爬虫(每天一更)

    # -*- coding: utf-8 -*-
    '''
    Created on 2019年5月7日
    
    @author: 薛卫卫
    '''
    import itertools
    import urllib.request
    import re
    
    def download(url, user_agent="wswp", num_retries=2):
        """Fetch *url* and return the raw response body, or ``None`` on failure.

        Args:
            url: Address to fetch.
            user_agent: Value sent in the ``User-agent`` request header.
            num_retries: How many further attempts to make when the server
                answers with a 5xx status code.

        Returns:
            The response body as ``bytes``, or ``None`` when the download
            failed and no retry succeeded.
        """
        print("Downloading: " , url)
        headers = {'User-agent': user_agent}
        request = urllib.request.Request(url, headers=headers)
        try:
            # The context manager closes the connection even if read() raises,
            # so repeated calls do not leak open sockets/file handles.
            with urllib.request.urlopen(request) as response:
                return response.read()
        except urllib.request.URLError as e:
            print('Download error:' , e.reason)
            # Only errors carrying a 5xx status code are retried; errors
            # without a ``code`` attribute or with other codes give up at once.
            if num_retries > 0 and 500 <= getattr(e, 'code', 0) < 600:
                return download(url, user_agent, num_retries - 1)
            return None
    
    # Walk the numeric IDs 1, 2, 3, ... and stop at the first ID whose
    # page cannot be downloaded.
    page_ids = itertools.count(1)
    while True:
        page = next(page_ids)
        url = 'http://example.webscraping.com/view/-%d' % page
        html = download(url)
        if html is None:
            # a failed download is taken as the end of the ID sequence
            break
        # success - the page content in `html` can be scraped here
        
    #     
    # # maximum number of consecutive download errors allowed
    # max_errors = 5
    # # current number of consecutive download errors
    # num_errors = 0
    # for page in itertools.count(1):
    #     url = 'http://example.webscraping.com/view/-%d' % page
    #     html = download(url)
    #     if html is None:
    #         # received an error trying to download this webpage
    #         num_errors += 1
    #         if num_errors == max_errors:
    #             # reached maximum number of
    #             # consecutive errors so exit
    #             break
    #     else:
    #         # success - can scrape the result
    #         # ...
    #         num_errors = 0
    

      

  • 相关阅读:
    自己编译linux内核
    codeblocks中文输入及控制台乱码
    c快速学习代码
    二分法求方程解
    苹果公司前任首席执行官乔布斯语录
    html5特性
    lamp server快速搭建
    poj 1247 Magnificent Meatballs 解题报告
    Ural 1005 Stone Pilet 解题报告
    Ural 1020 Rope 解题报告
  • 原文地址:https://www.cnblogs.com/xww115/p/10835223.html
Copyright © 2011-2022 走看看