zoukankan      html  css  js  c++  java
  • urllib库中常见的类和方法

     

    from urllib.request import urlopen
    from http.client import HTTPResponse

    # urlopen() returns an http.client.HTTPResponse object.
    response = urlopen('http://www.baidu.com')
    print(type(response))

    # The response is used as a context manager so it is closed when the
    # block ends; everything that reads from it therefore lives inside.
    with response:
        print(1, response.status)
        print(2, response.reason)
        print(3, response.geturl())
        print(4, response.info())  # response headers
        print(5, response.read())
    # urlopen 只能传递url和data,但是不能构造HTTP请求,所以Request类来实现

    # 初始化方法,构造一个请求对象 可以添加一个headers字典,data参数决定是GET或者POST
    # add_header(key,value)也可以为headers中增加一个键值对
    from urllib.request import Request,urlopen
    import random
    url = 'http://www.bing.com'
    # Pool of browser User-Agent strings; one is picked at random per run.
    ua_list = [
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11 ",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0.1) Gecko/20100101 Firefox/4.0.1",
        "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50",
        "Opera/9.80 (Windows NT 6.1; U; en) Presto/2.8.131 Version/11.11",
    ]
    ua = random.choice(ua_list)
    req = Request(url)
    req.add_header('User-agent', ua)
    print('type_req', type(req))  # <class 'urllib.request.Request'>

    response = urlopen(req, timeout=20)
    print(type(response))  # <class 'http.client.HTTPResponse'>

    # Read everything from the response inside the block; it is closed on exit.
    with response:
        print(1, response.status, response.getcode(), response.reason)
        print(2, response.geturl())
        print(3, response.info())  # response headers
        print(4, response.read())

    # Request stores header names via str.capitalize(), which is why the
    # lookup key is spelled 'User-agent' (line 6 shows the transformation).
    print(5, req.get_header('User-agent'))
    print(6, 'user-agent'.capitalize())

    # urllib.parse 模块
    from urllib import parse
    # Encode a mapping into a query string; reserved characters in the
    # values are percent-escaped.
    params = dict(url='http://www.baidu.com', p_url='http://www.baidu.com')
    x = parse.urlencode(params)
    print(x)

    # Encoding: the non-ASCII value is percent-encoded; the raw UTF-8 bytes
    # are printed below for comparison.
    u = parse.urlencode(dict(wd='中国'))
    print(u)
    url = "https://www.baidu.com/s?{}".format(u)
    print(url)
    print('中国'.encode('utf-8'))

    # Decoding: unquote() turns the percent-escapes back into text.
    for encoded in (u, url):
        print(parse.unquote(encoded))

    需求:通过关键字在bing中搜索,返回结果保存在html文件中
    from urllib.request import Request,urlopen
    from urllib.parse import urlencode
    import random
    # Ask for a search keyword and build the bing search URL from it.
    keyword = input("请输入关键字")
    data = urlencode({'q': keyword})
    base_url = 'http://cn.bing.com/search'
    url = '{}?{}'.format(base_url, data)
    print(url)
    # Pool of browser User-Agent strings; one is picked at random per run.
    ua_list = [
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11 ",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0.1) Gecko/20100101 Firefox/4.0.1",
        "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50",
        "Opera/9.80 (Windows NT 6.1; U; en) Presto/2.8.131 Version/11.11",
    ]
    ua = random.choice(ua_list)
    req = Request(url, headers={'User-agent': ua})
    # The response is opened first, then the output file; both are closed
    # when the block ends. The body is written as raw bytes ('wb').
    with urlopen(req) as response, open('1.html', 'wb') as f:
        f.write(response.read())
    print("success")

    # POST方法
    from urllib.request import Request,urlopen
    from urllib.parse import urlencode
    import simplejson
    import random
    # Pool of browser User-Agent strings; one is picked at random per run.
    ua_list = [
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11 ",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0.1) Gecko/20100101 Firefox/4.0.1",
        "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50",
        "Opera/9.80 (Windows NT 6.1; U; en) Presto/2.8.131 Version/11.11",
    ]
    ua = random.choice(ua_list)
    # Host corrected from the misspelled 'httonin.org'.
    # NOTE(review): httpbin.org is assumed to be the intended echo service - confirm.
    req = Request('http://httpbin.org/post')
    req.add_header('User-agent', ua)
    data = urlencode({'name': '张三,@=/&*', 'age': '6'})
    print(data)
    # Supplying data= turns the request into a POST.
    # res1 sends a hand-written body that is NOT URL-encoded; its reply is
    # not read, but the response is still closed when the block ends.
    with urlopen(req, data='name=张三,@=/&*,&age=6'.encode()) as res1:
        # res2 sends the urlencode()d form data (form submission).
        with urlopen(req, data=data.encode()) as res2:
            print(res2.read())

    # 豆瓣https://movie.douban.com/explore#!type=movie&tag=%E7%83%AD%E9%97%A8&sort=rank&page_limit=20&page_start=0
    from urllib.request import Request,urlopen
    from urllib.parse import urlencode
    import random

    # Pool of browser User-Agent strings; one is picked at random per run.
    ua_list = [
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11 ",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0.1) Gecko/20100101 Firefox/4.0.1",
        "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50",
        "Opera/9.80 (Windows NT 6.1; U; en) Presto/2.8.131 Version/11.11",
    ]
    ua = random.choice(ua_list)
    # NOTE(review): this URL ends in a '#!' fragment. urllib does not send the
    # fragment to the server, so the query string appended after it in the GET
    # request below never reaches the server. Confirm the intended endpoint.
    url = 'https://movie.douban.com/explore#!'
    req = Request(url)
    req.add_header('User-agent', ua)

    data = urlencode({
        'type': 'movie',
        'tag': '热门',
        'sort': 'rank',
        'page_limit': 8,
        'page_start': 10,
    })
    # POST: the encoded form is sent as the request body.
    res = urlopen(req, data=data.encode())
    with res:
        print(res._method)  # private attribute, used only to show the verb
        print(1, res.read().decode())
    # GET: the encoded form is appended to the URL. The request carries the
    # same User-agent header as the POST instead of urllib's default one.
    get_req = Request('{}?{}'.format(url, data), headers={'User-agent': ua})
    with urlopen(get_req) as res:
        print(res._method)
        print(2, res.read().decode())

    from urllib.request import Request,urlopen
    import ssl
    import random
    # Pool of browser User-Agent strings; one is picked at random per run.
    ua_list = [
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11 ",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0.1) Gecko/20100101 Firefox/4.0.1",
        "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50",
        "Opera/9.80 (Windows NT 6.1; U; en) Presto/2.8.131 Version/11.11",
    ]
    ua = random.choice(ua_list)
    request = Request('https://www.12306.cn/mormhweb/')
    request.add_header('User-agent', ua)
    # Accept untrusted certificates, using only public ssl API.
    # SECURITY: this disables certificate and hostname verification; it is
    # suitable for a demo only, never for traffic that must be trusted.
    context = ssl.create_default_context()
    context.check_hostname = False  # must be off before verify_mode can be CERT_NONE
    context.verify_mode = ssl.CERT_NONE
    res = urlopen(request, context=context)
    with res:
        print(res._method)  # private attribute, used only to show the verb
        print(res.geturl())
        print(res.read().decode())

    标准库urllib缺少一些关键功能(比如连接池管理),第三方库urllib3提供了这些功能
    import urllib3
    import random
    url = 'https://movie.douban.com'
    # Pool of browser User-Agent strings; one is picked at random per run.
    ua_list = [
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11 ",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0.1) Gecko/20100101 Firefox/4.0.1",
        "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50",
        "Opera/9.80 (Windows NT 6.1; U; en) Presto/2.8.131 Version/11.11",
    ]
    ua = random.choice(ua_list)
    # The PoolManager is used as a context manager; all requests and reads
    # happen inside the block.
    with urllib3.PoolManager() as http:
        response = http.request('GET', url, headers={'User-agent': ua})
        print(type(response))  # <class 'urllib3.response.HTTPResponse'>
        print(response.status, response.reason)
        print(response.headers)
        print(response.data)

    requests库使用了urllib3库,提供了更友好的API

    import requests
    import random
    url = 'https://movie.douban.com'
    # Pool of browser User-Agent strings; one is picked at random per run.
    ua_list = [
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11 ",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0.1) Gecko/20100101 Firefox/4.0.1",
        "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50",
        "Opera/9.80 (Windows NT 6.1; U; en) Presto/2.8.131 Version/11.11",
    ]
    ua = random.choice(ua_list)
    response = requests.request('GET', url, headers={'User-Agent': ua})
    # Inspect the response and save its body while it is still open.
    with response:
        print(type(response))
        print(response.url)
        print(response.status_code)
        print(response.request.headers)  # request headers
        print(response.headers)  # response headers
        print(response.text)
        # Save the decoded page text as UTF-8.
        with open('movie.html', 'w', encoding='utf-8') as f:
            f.write(response.text)
    requests默认使用了Session对象,是为了多次与服务器交互保留会话信息:
    # 直接使用session
    import requests
    import random
    # Pool of browser User-Agent strings; one is picked at random per run.
    ua_list = [
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11 ",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0.1) Gecko/20100101 Firefox/4.0.1",
        "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50",
        "Opera/9.80 (Windows NT 6.1; U; en) Presto/2.8.131 Version/11.11",
    ]
    ua = random.choice(ua_list)
    urls = ['https://www.baidu.com', 'https://www.baidu.com']
    session = requests.Session()
    print(type(session))
    # One Session object is shared by every request in the loop.
    with session:
        for url in urls:
            response = session.get(url, headers={'User-agent': ua})
            with response:
                print(type(response))  # <class 'requests.models.Response'>
                print(response.url)
                print(response.status_code)
                print('headers', response.request.headers)
                print('cookie', response.cookies)
                print(response.text[:20])


    
    
  • 相关阅读:
    Native RabbitMQ Direct Exchange
    RabbitMQ系列文章导读
    AbstractQueuedSynchronizer
    CountDownLatch和CyclicBarrier
    显示锁Lock
    《SeleniumBasic 3.141.0.0
    《SeleniumBasic 3.141.0.0
    《SeleniumBasic 3.141.0.0
    《SeleniumBasic 3.141.0.0
    《SeleniumBasic 3.141.0.0
  • 原文地址:https://www.cnblogs.com/qyan-blog/p/12153645.html
Copyright © 2011-2022 走看看