zoukankan      html  css  js  c++  java
  • Python3中urllib详细使用方法(header,代理,超时,认证,异常处理)

    python3 抓取网页资源的 N 种方法

    1、最简单

    import urllib.request

    # Simplest possible fetch. Using the response as a context manager closes
    # the underlying connection as soon as the body has been read, instead of
    # leaving it open until garbage collection.
    with urllib.request.urlopen('http://python.org/') as response:
        html = response.read()  # raw bytes; decode before treating as text

    2、使用 Request

    import urllib.request

    # Build an explicit Request object first; this is the hook for adding
    # headers or a request body before the URL is opened.
    req = urllib.request.Request('http://python.org/')

    # The context manager closes the response once the body has been read.
    with urllib.request.urlopen(req) as response:
        the_page = response.read()  # raw bytes

    3、发送数据

    #! /usr/bin/env python3

    import urllib.parse
    import urllib.request

    url = 'http://localhost/login.php'
    # Form fields to submit.
    values = {
        'act': 'login',
        'login[email]': 'yzhang@i9i8.com',
        'login[password]': '123456',
    }

    # urlencode() returns a str, but a request body must be bytes in Python 3;
    # passing the str through makes urlopen() raise TypeError, so encode it.
    data = urllib.parse.urlencode(values).encode('utf-8')
    # Supplying a body turns the request into a POST.
    req = urllib.request.Request(url, data)
    req.add_header('Referer', 'http://www.python.org/')

    # The context manager closes the response once the body has been read.
    with urllib.request.urlopen(req) as response:
        the_page = response.read()

    print(the_page.decode("utf8"))

    4、发送数据和header

    #! /usr/bin/env python3

    import urllib.parse
    import urllib.request

    url = 'http://localhost/login.php'
    user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
    # Form fields to submit.
    values = {
        'act': 'login',
        'login[email]': 'yzhang@i9i8.com',
        'login[password]': '123456',
    }
    # Extra request headers, passed to Request() as a dict.
    headers = {'User-Agent': user_agent}

    # urlencode() returns a str, but a request body must be bytes in Python 3;
    # passing the str through makes urlopen() raise TypeError, so encode it.
    data = urllib.parse.urlencode(values).encode('utf-8')
    # Supplying a body turns the request into a POST.
    req = urllib.request.Request(url, data, headers)

    # The context manager closes the response once the body has been read.
    with urllib.request.urlopen(req) as response:
        the_page = response.read()

    print(the_page.decode("utf8"))

    5、http 错误

    #! /usr/bin/env python3

    # Import urllib.error explicitly rather than relying on urllib.request
    # having imported it as a side effect.
    import urllib.error
    import urllib.request

    # The URL literal must not contain whitespace (a trailing space was removed).
    req = urllib.request.Request('http://www.111cn.net')
    try:
        # Only the status matters here, so close the response straight away.
        urllib.request.urlopen(req).close()
    except urllib.error.HTTPError as e:
        # The server answered with an error status; the exception object also
        # exposes the error page body via read().
        print(e.code)
        print(e.read().decode("utf8"))

    6、异常处理1

    #! /usr/bin/env python3

    from urllib.request import Request, urlopen
    from urllib.error import URLError, HTTPError

    # The host name must not contain whitespace (a stray space was removed).
    req = Request("http://www.111cn.net/")
    try:
        response = urlopen(req)
    except HTTPError as e:
        # HTTPError must be caught before URLError because it is a subclass
        # of URLError and would otherwise be swallowed by the handler below.
        # Double quotes keep the apostrophe from terminating the string.
        print("The server couldn't fulfill the request.")
        print('Error code: ', e.code)
    except URLError as e:
        print('We failed to reach a server.')
        print('Reason: ', e.reason)
    else:
        # Success path: read the body and close the response afterwards.
        print("good!")
        with response:
            print(response.read().decode("utf8"))

    7、异常处理2

    #! /usr/bin/env python3

    from urllib.request import Request, urlopen
    from urllib.error import URLError

    # The host name must not contain whitespace (a stray space was removed).
    req = Request("http://www.111cn.net/")
    try:
        response = urlopen(req)
    except URLError as e:
        # A single handler for both failure kinds. HTTPError (a URLError
        # subclass) has both .code and .reason, so .code has to be tested
        # first; otherwise every HTTP error would be reported as a
        # connection failure.
        if hasattr(e, 'code'):
            # Double quotes keep the apostrophe from terminating the string.
            print("The server couldn't fulfill the request.")
            print('Error code: ', e.code)
        elif hasattr(e, 'reason'):
            print('We failed to reach a server.')
            print('Reason: ', e.reason)
    else:
        # Success path: read the body and close the response afterwards.
        print("good!")
        with response:
            print(response.read().decode("utf8"))

    8、HTTP 认证

    #! /usr/bin/env python3

    import urllib.request

    # create a password manager
    password_mgr = urllib.request.HTTPPasswordMgrWithDefaultRealm()

    # Add the username and password.
    # If we knew the realm, we could use it instead of None.
    # The URL must not contain whitespace (a stray space was removed).
    top_level_url = "https://www.111cn.net/"
    password_mgr.add_password(None, top_level_url, 'rekfan', 'xxxxxx')

    handler = urllib.request.HTTPBasicAuthHandler(password_mgr)

    # create "opener" (OpenerDirector instance)
    opener = urllib.request.build_opener(handler)

    # use the opener to fetch a URL; the context manager closes the response
    a_url = "https://www.111cn.net/"
    with opener.open(a_url) as x:
        print(x.read())  # prints the raw bytes

    # Install the opener.
    # Now all calls to urllib.request.urlopen use our opener.
    urllib.request.install_opener(opener)

    with urllib.request.urlopen(a_url) as response:
        a = response.read().decode('utf8')
    print(a)

    9、使用代理

    #! /usr/bin/env python3

    import urllib.request

    # ProxyHandler maps the scheme of the URL being fetched ('http', 'https',
    # ...) to the proxy that should serve it. A key such as 'sock5' never
    # matches a fetched URL, so the proxy would silently be bypassed.
    # NOTE(review): this assumes an HTTP proxy is listening on localhost:1080.
    # urllib has no built-in SOCKS support; a SOCKS5 proxy needs a third-party
    # package instead.
    proxy_support = urllib.request.ProxyHandler({
        'http': 'http://localhost:1080',
        'https': 'http://localhost:1080',
    })
    opener = urllib.request.build_opener(proxy_support)
    # From here on, urllib.request.urlopen goes through the proxy-aware opener.
    urllib.request.install_opener(opener)

    # The URL must not contain whitespace (a trailing space was removed).
    with urllib.request.urlopen("http://www.111cn.net") as response:
        a = response.read().decode("utf8")
    print(a)

    10、超时

    #! /usr/bin/env python3

    import socket
    import urllib.request

    # timeout in seconds
    timeout = 2
    # Sets the default timeout for sockets created after this call.
    socket.setdefaulttimeout(timeout)

    # this call to urllib.request.urlopen now uses the default timeout
    # we have set in the socket module
    # The host name must not contain whitespace (a stray space was removed).
    req = urllib.request.Request('http://www.111cn.net/')
    # The context manager closes the response once the body has been read.
    with urllib.request.urlopen(req) as response:
        a = response.read()
    print(a)  # prints the raw bytes

  • 相关阅读:
    给出两个 非空 的链表用来表示两个非负的整数。其中,它们各自的位数是按照 逆序 的方式存储的,并且它们的每个节点只能存储 一位 数字。
    11
    实战 迁移学习 VGG19、ResNet50、InceptionV3 实践 猫狗大战 问题
    tx2系统备份与恢复
    如何在Ubuntu 18.04上安装和卸载TeamViewer
    bzoj 3732 Network (kruskal重构树)
    bzoj2152 聪聪可可 (树形dp)
    牛客 216D 消消乐 (二分图最小点覆盖)
    牛客 197E 01串
    Wannafly挑战赛23
  • 原文地址:https://www.cnblogs.com/onemorepoint/p/7396155.html
Copyright © 2011-2022 走看看