zoukankan html css js c++ java

python 多功能下载网页

#下载网页
#具有功能：捕获异常，重试下载并设置用户代理
import urllib.request
import urllib.error
#下载网页
#wscp:默认用户代理 web scraping with python 缩写
def download(url, user_agent='wscp', num_retries=2):
    """Download *url* and return the raw response body.

    Args:
        url: Address of the page to fetch.
        user_agent: Value sent in the ``User-Agent`` request header.
        num_retries: How many additional attempts to make when the server
            answers with a 5xx status code.

    Returns:
        The response body as ``bytes``, or ``None`` when the download failed
        (a ``urllib.error.URLError`` was raised and no retry succeeded).
    """
    headers = {'User-Agent': user_agent}
    while True:
        print('Downloading:', url)  # log every attempt, including retries
        request = urllib.request.Request(url, headers=headers)
        try:
            # The context manager closes the response (and its connection)
            # as soon as the body has been read.
            with urllib.request.urlopen(request) as response:
                return response.read()
        except urllib.error.URLError as e:
            print('download error:', e.reason)
            # Only 5xx responses are retried: a 4xx such as 404 means the
            # page does not exist, and errors without a status code (e.g. an
            # unreachable host) are not retried either.
            is_server_error = hasattr(e, 'code') and 500 <= e.code < 600
            if num_retries <= 0 or not is_server_error:
                return None
            print(user_agent)  # existing diagnostic output before a retry
            num_retries -= 1
# Demo run: fetch a regular page first, then a URL used to test the
# HTTP 500 error path.
for demo_url in ('http://example.webscraping.com/', 'http://httpstat.us/500'):
    download(demo_url)
# print(dir(urllib))

Downloading: http://example.webscraping.com/
Downloading: http://httpstat.us/500
download error: Internal Server Error
wscp
Downloading: http://httpstat.us/500
download error: Internal Server Error
wscp
Downloading: http://httpstat.us/500
download error: Internal Server Error

查看全文

相关阅读:
【LeetCode每天一题】Rotate List(旋转链表)
【LeetCode每天一题】Permutation Sequence(排列序列)
【LeetCode每天一题】Length of Last Word(字符串中最后一个单词的长度)
【LeetCode每天一题】Merge Intervals(合并区间)
【LeetCode每天一题】Spiral Matrix II(螺旋数组II)
Ajax基础
 git的命令行操作
 新闻发布系统之登录和注销
 JSTL和EL
servlet模板的修改

原文地址：https://www.cnblogs.com/liangliangzz/p/10160482.html