zoukankan      html  css  js  c++  java
  • urllib库

    Python 内置的最基本的 HTTP 请求库,包含以下四个模块:

    urllib.request  请求模块

    urllib.error    异常处理模块

    urllib.parse   url解析模块

    urllib.robotparser  robots.txt解析模块

    urllib.request请求模块:

    urllib.request.urlopen(url,data=None,[timeout,]*,cafile=None,capath=None,cadefault=False,context=None)

    # urlopen(): send a GET request and print the decoded response body.
    import urllib.request

    # The context manager closes the connection once the body has been read.
    with urllib.request.urlopen("http://www.baidu.com") as response:
        # response.read() returns bytes, so it must be decoded before printing.
        print(response.read().decode("utf-8"))

    import urllib.parse

    # Supplying `data` makes urlopen() issue a POST request. The payload must
    # be bytes, so the url-encoded key/value string is encoded as UTF-8.
    data = urllib.parse.urlencode({'word': 'hello'}).encode('utf-8')
    # httpbin.org is a site dedicated to testing HTTP requests.
    with urllib.request.urlopen("http://httpbin.org/post", data=data) as response:
        print(response.read())

    # `timeout` is the number of seconds urlopen() waits before giving up.
    with urllib.request.urlopen("http://httpbin.org/get", timeout=1) as response:
        print(response.read())

    import socket
    import urllib.error

    try:
        # Deliberately tiny timeout to provoke a failure; close the response
        # straight away if the request unexpectedly succeeds.
        urllib.request.urlopen("http://httpbin.org/get", timeout=0.1).close()
    except urllib.error.URLError as e:
        # A connection timeout is reported as a URLError whose `reason` is a
        # socket.timeout instance.
        if isinstance(e.reason, socket.timeout):
            print('TIME OUT')

    # Type of the response object returned by urlopen()
    print(type(response))

    # Response status code and headers
    with urllib.request.urlopen("https://www.python.org") as response:
        print(response.status)  # HTTP status code of the response
        print(response.getheaders())  # all response headers as (name, value) pairs
        print(response.getheader("Server"))  # value of one header, looked up by name


    # Request(): when urlopen() has to send extra information such as headers,
    # build a Request object and pass it to urlopen() in place of the URL.
    import urllib.parse

    url = "http://httpbin.org/post"
    headers = {
        # 'User-Agent':'Mozilla/4.0(compatible;MSIE 5.5;Windows NT)',
        'Host': 'httpbin.org',
    }
    # Named `form` rather than `dict` so the builtin `dict` is not shadowed.
    form = {
        'name': 'Germey',
    }
    data = urllib.parse.urlencode(form).encode('utf-8')
    req = urllib.request.Request(url=url, data=data, headers=headers, method='POST')
    # Headers can also be added one at a time after construction.
    req.add_header('User-Agent', 'Mozilla/4.0(compatible;MSIE 5.5;Windows NT)')
    with urllib.request.urlopen(req) as response:
        print(response.read().decode('utf-8'))


    # Cookies: collect the cookies a site sets and print them.
    import http.cookiejar
    import urllib.request

    cookie = http.cookiejar.MozillaCookieJar()
    handler = urllib.request.HTTPCookieProcessor(cookie)
    opener = urllib.request.build_opener(handler)
    # Requests made through this opener store any received cookies in `cookie`.
    with opener.open("http://www.baidu.com") as response:
        pass
    for item in cookie:
        # Formatting (rather than `+`) avoids a TypeError if a cookie has no value.
        print(f"{item.name}=: {item.value}")

    # Save cookies to a file
    filename = "cookieLWP.txt"
    cookie = http.cookiejar.LWPCookieJar(filename)
    handler = urllib.request.HTTPCookieProcessor(cookie)
    opener = urllib.request.build_opener(handler)
    # Only the cookies are of interest here, so the response is closed unread.
    with opener.open("http://www.baidu.com") as response:
        pass
    # ignore_discard / ignore_expires: also write session and expired cookies.
    cookie.save(ignore_discard=True, ignore_expires=True)
    # Load cookies from the file
    # Use the same jar class for loading as was used for saving.
    cookie = http.cookiejar.LWPCookieJar()
    cookie.load('cookieLWP.txt', ignore_discard=True, ignore_expires=True)
    handler = urllib.request.HTTPCookieProcessor(cookie)
    opener = urllib.request.build_opener(handler)
    with opener.open("http://www.baidu.com") as response:
        print(response.read().decode('utf-8'))



    urllib.error异常处理模块:
    # Exception handling
    from urllib import error

    try:
        response = urllib.request.urlopen("https://www.cnblogs.com/wisir/index.html")
    except error.HTTPError as e:
        # HTTPError is a subclass of URLError, so it has to be caught first.
        print(e.reason, e.code, e.headers, sep=' ')
    except error.URLError as e:
        print(e.reason)
    else:
        print("Request Successfully")
        # The body is not used; release the connection.
        response.close()

    try:
        # A 0.01 s timeout is short enough that the request is expected to fail.
        response = urllib.request.urlopen("https://www.baidu.com", timeout=0.01)
    except urllib.error.URLError as e:
        print(e.reason)
        # `reason` may be an exception instance; a timeout shows up as socket.timeout.
        if isinstance(e.reason, socket.timeout):
            print('TIME OUT')


    urllib.parse URL解析模块:
    # urlparse: split a URL string into its components.
    # Signature: urllib.parse.urlparse(urlstring, scheme="", allow_fragments=True)
    from urllib.parse import urlparse

    sample_url = "http://www.baidu.com/index.html;user?id=5#comment"
    result = urlparse(sample_url)
    print(type(result), result)

    # urlunparse: the inverse of urlparse -- it combines the six components of
    # a ParseResult into one complete URL.
    from urllib.parse import urlunparse

    data = ['http', 'www.baidu.com', 'index.html', 'user', 'a=6', 'comment']
    full_url = urlunparse(data)
    print(full_url)

    # urljoin: the second argument takes precedence; any of the six URL
    # components it lacks are filled in from the first argument.
    from urllib.parse import urljoin

    for target in ("FAQ.html", "https://www.cnblogs.com/wisir/"):
        print(urljoin("http://www.baidu.com", target))

    # urlencode: convert a dict into GET request parameters.
    from urllib.parse import urlencode

    params = {'name': 'germey', 'age': 22}
    base_url = "http://www.baidu.com?"
    url = f"{base_url}{urlencode(params)}"
    print(url)


    python3 urllib库官方文档:https://docs.python.org/3/library/urllib.html








  • 相关阅读:
    bzoj1562: [NOI2009]变换序列
    bzoj2763: [JLOI2011]飞行路线
    有上下界网络流
    bzoj3211: 花神游历各国
    bzoj3668: [Noi2014]起床困难综合症
    bzoj2743: [HEOI2012]采花
    BZOJ 1787: [Ahoi2008]Meet 紧急集合
    BZOJ 1029 [JSOI2007]建筑抢修 贪心
    BZOJ 2748 音量调节
    BZOJ 3524: [Poi2014]Couriers 主席树
  • 原文地址:https://www.cnblogs.com/wisir/p/9969833.html
Copyright © 2011-2022 走看看