  • Crawlers, Part 1: Sending Requests & Customizing Requests & Exception Handling & Configuring Proxies

    1. The urllib.request module

    import urllib.request

    url1 = "http://www.baidu.com"
    image_url = ('https://ss0.bdstatic.com/94oJfD_bAAcT8t7mm9GUKT-xh_/timg?image&quality'
                 '=100&size=b4000_4000&sec=1561451694&di=4123b89e27e7f8d6091dfedc3e222d5a&src'
                 '=http://b-ssl.duitang.com/uploads/item/201711/01/20171101201000_UBjmK.jpeg')

    '''Method 1: urlopen'''
    rep = urllib.request.urlopen(url=url1)     # send the request and receive the response
    # print(rep)
    # print(rep.read().decode())   # read the body and decode the bytes into a string
    # print(rep.geturl())          # get the URL that was fetched
    # print(rep.getheaders())      # get the response headers
    # print(rep.getcode())         # get the status code
    # print(rep.readlines())       # read line by line

    rep2 = urllib.request.urlopen(url=image_url)
    # with open('gaolu.jpeg','wb') as fp:       # save the downloaded content to a file
    #     fp.write(rep2.read())

    '''Method 2: urlretrieve'''
    rep3 = urllib.request.urlretrieve(image_url, 'gl.jpeg')    # download straight to a file
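
    urlopen also accepts a timeout and works as a context manager, which closes the connection automatically. A minimal sketch (the 5-second timeout is an arbitrary choice):

    import urllib.request

    # open, read, and close in one block; raises URLError if the server
    # does not respond within 5 seconds
    with urllib.request.urlopen('http://www.baidu.com', timeout=5) as rep:
        html = rep.read().decode()
    print(len(html))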

    2. The urllib.parse module

    import urllib.parse

    '''Method 1: quote and unquote'''
    url = 'http://www.baidu.com/index.html?name=狗蛋&pwd=123456'
    ret = urllib.parse.quote(url)       # percent-encode characters that are not URL-safe
    print(ret)
    ret2 = urllib.parse.unquote(ret)    # URL-decode back to the original
    print(ret2)

    '''Method 2: urlencode'''
    url2 = 'http://www.baidu.com/index.html'
    data = {'name': '狗蛋',
            'age': 15}

    '''Building the query string by hand'''
    lt = []
    for k, v in data.items():
        lt.append(k + '=' + str(v))
    query_string = '&'.join(lt)
    print(query_string)
    url3 = url2 + '?' + query_string
    print(url3)

    '''Building it with urlencode'''
    query_string2 = urllib.parse.urlencode(data)   # convert a dict into a URL query string
    print(query_string2)
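
    urllib.parse also covers the reverse direction. A quick sketch using urlparse and parse_qs from the standard library:

    from urllib.parse import urlparse, parse_qs

    url = 'http://www.baidu.com/index.html?name=%E7%8B%97%E8%9B%8B&age=15'
    parts = urlparse(url)           # split into scheme, netloc, path, query, ...
    print(parse_qs(parts.query))    # {'name': ['狗蛋'], 'age': ['15']}; values come back as lists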

    3. Creating a request object with request.Request

    import urllib.request
    import urllib.parse

    url1 = 'http://www.baidu.com/'

    '''Customize the request headers to disguise the client: step one of beating anti-crawler checks'''
    headers1 = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; Win64; x64) '
                              'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36'}

    req = urllib.request.Request(url=url1, headers=headers1)    # build the request object
    rep = urllib.request.urlopen(req)
    # print(rep.read().decode())
    print(rep.getheaders())    # response headers
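
    To confirm the header was attached, the Request object itself can be inspected (these are the request headers; getheaders() above returns the response headers):

    print(req.header_items())    # e.g. [('User-agent', 'Mozilla/5.0 ...')]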

    4. POST requests

    import urllib.request
    import urllib.parse

    post_url = 'https://fanyi.baidu.com/sug'

    word = input('>>>:')
    '''Build the POST form data'''
    data = {'kw': word}
    form_data = urllib.parse.urlencode(data).encode()    # dict -> query string -> bytes

    headers1 = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; Win64; x64) '
                              'AppleWebKit/537.36 (KHTML, like Gecko) '
                              'Chrome/73.0.3683.86 Safari/537.36'}

    req = urllib.request.Request(url=post_url, headers=headers1)

    rep = urllib.request.urlopen(req, data=form_data)    # passing data turns this into a POST request

    ret = rep.read().decode()

    print(ret)
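
    The sug endpoint answers with JSON rather than HTML, so the raw string above is easier to use after decoding. A minimal sketch, assuming the response carries a 'data' list whose items have 'k' and 'v' fields (the exact shape is an assumption, not a documented API):

    import json

    result = json.loads(ret)
    for item in result.get('data', []):    # 'data', 'k', 'v' are assumed field names
        print(item.get('k'), '->', item.get('v'))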

    5. Ajax GET requests

    import urllib.request
    import urllib.parse

    '''Douban ranking API: https://movie.douban.com/j/chart/top_list?type=24&interval_id=100%3A90&action=&start=120&limit=20'''

    url = ('https://movie.douban.com/j/chart/top_list?type'
           '=24&interval_id=100%3A90&action=&')

    dic_data = {'start': 4,
                'limit': 5}    # customize the paging parameters
    data = urllib.parse.urlencode(dic_data)
    url += data

    headers1 = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; Win64; x64) '
                              'AppleWebKit/537.36 (KHTML, like Gecko) '
                              'Chrome/73.0.3683.86 Safari/537.36'}

    req = urllib.request.Request(url=url, headers=headers1)

    rep = urllib.request.urlopen(req)
    print(rep.read().decode())
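
    This endpoint also returns JSON (an array of movie objects), so the same json.loads step applies. A sketch replacing the final print; the 'title' and 'score' field names are assumptions about the response shape:

    import json

    ret = rep.read().decode()    # read() can only be called once, so keep the string
    movies = json.loads(ret)
    for m in movies:
        print(m.get('title'), m.get('score'))    # assumed field names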

    6. A Baidu Tieba example

    import urllib.request
    import urllib.parse
    import os

    '''Fetch the posts of a given Baidu Tieba forum over a page range and write each page to a file'''

    url = 'http://tieba.baidu.com/f?ie=utf-8&'

    headers1 = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; Win64; x64) '
                              'AppleWebKit/537.36 (KHTML, like Gecko) '
                              'Chrome/73.0.3683.86 Safari/537.36'}

    ba_name = input("Enter the forum name: ")
    start_page = int(input("Enter the start page: "))
    end_page = int(input("Enter the end page: "))

    if not os.path.exists(ba_name):
        os.mkdir(ba_name)

    for page in range(start_page, end_page + 1):

        data = {'kw': ba_name,
                'pn': (page - 1) * 50}    # query parameters; pn is the post offset, 50 posts per page

        form_data = urllib.parse.urlencode(data)    # dict -> query string
        post_url = url + form_data                  # build the full URL

        req = urllib.request.Request(url=post_url, headers=headers1)    # create the request object

        rep = urllib.request.urlopen(req)    # send the request

        ret = rep.read()

        filename = str(page) + '.html'
        filepath = os.path.join(ba_name, filename)
        with open(filepath, 'wb') as fp:
            fp.write(ret)
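
    When looping over many pages it is polite, and safer against blocking, to pause between requests. A small addition inside the loop body; the 1-second delay is an arbitrary choice:

    import time

    time.sleep(1)    # after each fetch, before requesting the next page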

    7. URLError

    import urllib.request
    import urllib.parse
    import urllib.error

    url = 'http://www.maodan.com'

    '''Exception: the catch-all exception class'''
    # try:
    #     rep = urllib.request.urlopen(url)
    #     print(rep)
    # except Exception as e:
    #     print(e)

    '''Catching precisely with URLError'''
    try:
        rep = urllib.request.urlopen(url)
        print(rep)
    except urllib.error.URLError as e:
        print(e)

    8. HTTPError

    import urllib.request
    import urllib.parse
    import urllib.error

    url = 'https://www.cnblogs.com/java-chen-hao/p/1108374.html'

    '''Catching precisely with HTTPError, falling back to URLError'''
    try:
        rep = urllib.request.urlopen(url)
        print(rep)
    except urllib.error.HTTPError as e:
        print(e)
    except urllib.error.URLError as e:
        print(e)
    '''HTTPError is a subclass of URLError, so either one can catch an HTTP error.
    When catching both, put HTTPError first: let the child try first, and the
    parent steps in when the child cannot handle it.'''
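
    An HTTPError also carries the details of the failed response; these attributes are part of the standard library API:

    try:
        urllib.request.urlopen(url)
    except urllib.error.HTTPError as e:
        print(e.code)       # numeric status code, e.g. 404
        print(e.reason)     # human-readable reason phrase
        print(e.headers)    # headers of the error response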

    9. Handlers and openers

    import urllib.request
    import urllib.parse

    url = 'http://www.baidu.com/'

    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; Win64; x64) '
                             'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36'}

    '''Create an HTTPHandler'''
    my_handler = urllib.request.HTTPHandler()
    '''Build an opener from the handler'''
    my_opener = urllib.request.build_opener(my_handler)

    '''Build the request object'''
    req = urllib.request.Request(url, headers=headers)
    '''Send the request'''
    rep = my_opener.open(req)
    print(rep.read().decode())
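
    If the custom opener should apply everywhere, the standard library can install it globally so that plain urlopen calls are routed through it:

    urllib.request.install_opener(my_opener)    # make my_opener the default opener
    rep2 = urllib.request.urlopen(req)          # now goes through my_opener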

    10. Configuring a proxy in code

    import urllib.request
    import urllib.parse

    '''Create a ProxyHandler. The dict keys are URL schemes: an 'http' entry only
    proxies http:// URLs, so an 'https' entry is added for the https URL below.'''
    my_handler = urllib.request.ProxyHandler({'http': '114.215.95.188:3128',
                                              'https': '114.215.95.188:3128'})

    my_opener = urllib.request.build_opener(my_handler)

    '''Baidu search URL for looking up your outgoing IP address'''
    url = 'https://www.baidu.com/s?ie=utf-8&wd=IP'

    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; Win64; x64) '
                             'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36'}

    req = urllib.request.Request(url, headers=headers)

    rep = my_opener.open(req)

    with open('ip.html', 'wb') as fp:
        fp.write(rep.read())
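
    Free proxies fail often, so in practice the open call is worth wrapping with the URLError handling from section 7. A minimal sketch (the 10-second timeout is an arbitrary choice):

    import urllib.error

    try:
        rep = my_opener.open(req, timeout=10)    # fail fast if the proxy is dead
        with open('ip.html', 'wb') as fp:
            fp.write(rep.read())
    except urllib.error.URLError as e:
        print('proxy request failed:', e.reason)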