  • Python Development: An Introduction to Crawler Modules (Part 1)

    The urllib Library

    urllib is Python's built-in HTTP request library and consists of four modules:

    • urllib.request: the request module
    • urllib.error: the exception handling module
    • urllib.parse: the URL parsing module
    • urllib.robotparser: the robots.txt parsing module (see the brief sketch after this list)

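    The last of these, urllib.robotparser, is not covered in the numbered sections below. As a minimal sketch (not from the original post), it can be used to check whether a crawler is allowed to fetch a URL according to a site's robots.txt:

    import urllib.robotparser
    
    rp = urllib.robotparser.RobotFileParser()
    rp.set_url('http://www.baidu.com/robots.txt') # point it at the site's robots.txt
    rp.read()                                     # download and parse the rules
    print(rp.can_fetch('*','http://www.baidu.com/baidu')) # True or False for this user agent and path
    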
    1. urlopen() in urllib.request

    import urllib.request
    
    response = urllib.request.urlopen('http://www.baidu.com')
    print(response.read().decode('utf8'))
    
    response = urllib.request.urlopen('http://httpbin.org/get',timeout=1) # set a timeout
    print(response.read())
    
    import urllib.request
    import socket
    import urllib.error
    try:
        response = urllib.request.urlopen('http://httpbin.org/get',timeout=0.1)
    except urllib.error.URLError as e:
        if isinstance(e.reason,socket.timeout):
            print('TIME OUT')
    

    2. The response object

    • Response type
    import urllib.request
    
    response = urllib.request.urlopen('http://www.baidu.com')
    print(type(response)) # <class 'http.client.HTTPResponse'>
    
    • Status code and response headers
    import urllib.request
    
    response = urllib.request.urlopen('http://www.python.org')
    print(response.status) # 200
    print(response.getheaders())
    print(response.getheader('Server')) #nginx
    

    3. urllib.request.Request()

    from urllib import request,parse
    
    url = 'http://httpbin.org/post'
    
    headers = {
        'User-Agent':'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36',
        'Host':'httpbin.org'
    }
    
    dict = {
        'name':'Germey'
    }
    
    data = bytes(parse.urlencode(dict),encoding='utf8')
    req = request.Request(url=url,data=data,headers=headers,method='POST') # req is a Request object
    response = request.urlopen(req)
    print(response.read().decode('utf8')) 
    

    request.Request() makes it possible to flexibly construct the content and type of the request.

    You can also use the .add_header() method to add headers:

    from urllib import request,parse
    
    url = 'http://httpbin.org/post'
    
    dict = {
        'name':'Germey'
    }
    
    data = bytes(parse.urlencode(dict),encoding='utf8')
    req = request.Request(url=url,data=data,method='POST') # req is a Request object
    req.add_header('User-Agent','Mozilla/5.0 (Windows NT 6.1; WOW64) ') 
    response = request.urlopen(req)
    print(response.read().decode('utf8'))
    

    4. Proxies with urllib.request.ProxyHandler()

    import urllib.request
    
    proxy_handler = urllib.request.ProxyHandler({
        'http':'http://127.0.0.1:9743',
        'https':'https://127.0.0.1:9743',
    })
    
    opener = urllib.request.build_opener(proxy_handler)
    response = opener.open('http://httpbin.org/get')
    print(response.read())
    
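    As a side note, and only as a sketch that goes slightly beyond the original, an opener built this way can also be installed globally with install_opener(), so that plain urlopen() calls are routed through the (hypothetical) local proxy:

    import urllib.request
    
    proxy_handler = urllib.request.ProxyHandler({'http':'http://127.0.0.1:9743'}) # same hypothetical proxy as above
    opener = urllib.request.build_opener(proxy_handler)
    urllib.request.install_opener(opener) # from now on urlopen() uses this opener
    response = urllib.request.urlopen('http://httpbin.org/get')
    print(response.read())
    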

    5. Cookies

    import http.cookiejar,urllib.request
    
    cookie = http.cookiejar.CookieJar()
    handler = urllib.request.HTTPCookieProcessor(cookie)
    opener = urllib.request.build_opener(handler) # build_opener works here just like in the proxy example
    response = opener.open('http://www.baidu.com')
    for item in cookie:
        print(item.name + '='+ item.value)
    
    '''
    BAIDUID=A980763F2538BCB3FDA9E5BC979758CB:FG=1
    BIDUPSID=A980763F2538BCB3FDA9E5BC979758CB
    H_PS_PSSID=1453_26909_21094_18559_26350
    PSTM=1533972705
    BDSVRTM=0
    BD_HOME=0
    delPer=0
    '''
    

    Saving cookies to a text file:

    import http.cookiejar,urllib.request
    filename = 'cookie.txt'
    cookie = http.cookiejar.MozillaCookieJar(filename)
    handler = urllib.request.HTTPCookieProcessor(cookie)
    opener = urllib.request.build_opener(handler) # build_opener with the cookie handler
    response = opener.open('http://www.baidu.com')
    cookie.save(ignore_discard=True,ignore_expires=True)
    
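    To read the saved cookies back in a later run, here is a minimal sketch (assuming cookie.txt was written by the snippet above) that loads them with MozillaCookieJar.load():

    import http.cookiejar,urllib.request
    
    cookie = http.cookiejar.MozillaCookieJar()
    cookie.load('cookie.txt',ignore_discard=True,ignore_expires=True) # read the cookies saved earlier
    handler = urllib.request.HTTPCookieProcessor(cookie)
    opener = urllib.request.build_opener(handler)
    response = opener.open('http://www.baidu.com')
    print(response.read().decode('utf8'))
    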

    6. Exception handling

    from urllib import request,error
    
    try:
        response = request.urlopen('http://cuiqingcai/index.html')
    except error.URLError as e:
        print(e.reason) # [Errno 11004] getaddrinfo failed
    from urllib import request,error
    
    try:
        response = request.urlopen('http://cuiqingcai/index.html')
    except error.HTTPError as e:
        print(e.reason,e.code,e.headers,sep='\n')
    except error.URLError as e:
        print(e.reason) # [Errno 11004] getaddrinfo failed
    else:
        print('Request successful')
    

    Catch HTTPError first, then URLError.

    7. URL parsing

    from urllib.parse import urlparse
    
    result = urlparse('https://www.suning.com/?vip_frm=super_nav_vip')
    print(type(result),result)
    '''
    <class 'urllib.parse.ParseResult'> ParseResult(scheme='https', netloc='www.suning.com', path='/', params='', query='vip_frm=super_nav_vip', fragment='')
    '''
    # urlparse splits the URL into its standard components
    
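    The pieces of the ParseResult can be read back as attributes, and, as a small sketch going slightly beyond the original, urlunparse() reassembles the six components into a URL:

    from urllib.parse import urlparse,urlunparse
    
    result = urlparse('https://www.suning.com/?vip_frm=super_nav_vip')
    print(result.scheme,result.netloc,result.query) # https www.suning.com vip_frm=super_nav_vip
    print(urlunparse(result)) # rebuilds the original URL from its six components
    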

    8. urlencode converts a dict into GET request parameters

    from urllib.parse import urlencode
    
    params = {
        'name':'germay',
        'age':'12'
    }
    
    base_url = 'http://www.baidu.com?'
    url = base_url + urlencode(params)
    print(url) #http://www.baidu.com?name=germay&age=12
    

    The Requests Library

    Compared with urllib, the requests library is much more convenient. Here is a quick taste:

    import requests
    
    response = requests.get('http://www.baidu.com')
    print(response) # <Response [200]>
    print(response.status_code) # 200
    print(type(response.text),response.text) #<class 'str'>
    print(response.cookies) # <RequestsCookieJar[<Cookie BDORZ=27315 for .baidu.com/>]>
    

    1. The various request methods in requests

    import requests
    print(requests.get('http://httpbin.org/get'))
    print(requests.post('http://httpbin.org/post'))
    print(requests.delete('http://httpbin.org/delete'))
    print(requests.put('http://httpbin.org/put'))
    print(requests.options('http://httpbin.org/get'))
    print(requests.head('http://httpbin.org/get'))
    

    2. GET requests

    import requests
    response = requests.get('http://httpbin.org/get')
    print(response.text)
    '''
    {
      "args": {}, 
      "headers": {
        "Accept": "*/*", 
        "Accept-Encoding": "gzip, deflate", 
        "Connection": "close", 
        "Host": "httpbin.org", 
        "User-Agent": "python-requests/2.18.4"
      }, 
      "origin": "113.59.106.145", 
      "url": "http://httpbin.org/get"
    }
    '''
    

    3. GET requests with parameters

    import requests
    response = requests.get('http://httpbin.org/get?name=germay&age=22')
    print(response.text)
    '''
    {
      "args": {
        "age": "22", 
        "name": "germay"
      }, 
      "headers": {
        "Accept": "*/*", 
        "Accept-Encoding": "gzip, deflate", 
        "Connection": "close", 
        "Host": "httpbin.org", 
        "User-Agent": "python-requests/2.18.4"
      }, 
      "origin": "113.59.106.145", 
      "url": "http://httpbin.org/get?name=germay&age=22"
    }
    '''
    
    import requests
    data = {
       'name':'germay',
        'age':22
    }
    response = requests.get('http://httpbin.org/get',params=data)
    print(response.text)
    

    4. JSON parsing

    import requests
    import json
    response = requests.get('http://httpbin.org/get')
    print(response.json())
    print(json.loads(response.text))
    '''
    {'args': {}, 'headers': {'Accept': '*/*', 'Accept-Encoding': 'gzip, deflate', 'Connection': 'close', 'Host': 'httpbin.org', 'User-Agent': 'python-requests/2.18.4'}, 'origin': '113.59.106.145', 'url': 'http://httpbin.org/get'}
    {'args': {}, 'headers': {'Accept': '*/*', 'Accept-Encoding': 'gzip, deflate', 'Connection': 'close', 'Host': 'httpbin.org', 'User-Agent': 'python-requests/2.18.4'}, 'origin': '113.59.106.145', 'url': 'http://httpbin.org/get'}
    '''
    

    response.json() gives the same result as json.loads(response.text).

    5. Fetching binary data

    import requests
    response = requests.get('https://github.com/favicon.ico')
    print(type(response.text)) #<class 'str'>
    print(type(response.content))  #<class 'bytes'>
    

    .text is of type str, while .content is of type bytes.

    import requests
    response = requests.get('https://github.com/favicon.ico')
    print(type(response.text)) #<class 'str'>
    print(type(response.content))  #<class 'bytes'>
    
    with open('favicon.ico','wb') as f:
        f.write(response.content)
    

    6. Adding headers

    The result of crawling Zhihu without headers:

    import requests
    response = requests.get('https://www.zhihu.com/explore')
    print(response.text)
    '''
    <html>
    <head><title>400 Bad Request</title></head>
    <body bgcolor="white">
    <center><h1>400 Bad Request</h1></center>
    <hr><center>openresty</center>
    </body>
    </html> 
    '''
    

    With headers, the complete page is returned:

    import requests
    headers = {
        'user-agent':'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'
    }
    response = requests.get('https://www.zhihu.com/explore',headers=headers)
    print(response.text)
    

    So when crawling pages, adding headers is essential; without them you will most likely be blocked.

    7. The response object

    Attributes of the response object:

    import requests
    headers = {
        'user-agent':'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'
    }
    response = requests.get('https://www.zhihu.com/explore',headers=headers)
    print(response.status_code) # 200
    print(response.headers)
    print(response.cookies)
    print(response.url) #https://www.zhihu.com/explore
    print(response.history)
    

    8. File upload

    import requests
    
    files = {"file":open('favicon.ico','rb')}
    
    response = requests.post('http://httpbin.org/post',files=files)
    print(response.text)
    

    9. Getting cookies

    import requests
    
    response = requests.get('https://www.baidu.com')
    print(response.cookies)
    for k,v in response.cookies.items():
        print(k+'='+v)
    

    10. Session maintenance

    import requests
    
    s = requests.session()
    s.get('http://httpbin.org/cookies/set/number/1122112')
    response = s.get('http://httpbin.org/cookies')
    print(response.text)
    '''
    {
      "cookies": {
        "number": "1122112"
      }
    }
    '''
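
    For contrast, a minimal sketch (not in the original) of what happens without a Session: each requests.get() call is independent and shares no cookies, so the cookie set by the first request is not visible to the second:

    import requests
    
    requests.get('http://httpbin.org/cookies/set/number/1122112') # the cookie set here is discarded
    response = requests.get('http://httpbin.org/cookies')         # a brand-new request with no cookies
    print(response.text) # expected: {"cookies": {}}, since nothing persists between requests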

    11. Certificate verification

    import requests
    
    response = requests.get('https://www.12306.cn')
    print(response.status_code) #requests.exceptions.SSLError
    

    The code above raises an SSLError when accessing 12306 directly.

    With verify=False added, the page can be crawled normally:

    import requests
    from requests.packages import urllib3
    urllib3.disable_warnings() # suppress the InsecureRequestWarning
    response = requests.get('https://www.12306.cn',verify=False)
    print(response.status_code) # 200
    

    12. Proxies

    import requests
    proxy = {
        'http':'http://127.0.0.1:9743',
        'https':'https://127.0.0.1:9743',
    }
    
    response = requests.get('https://www.taobao.com',proxies=proxy)
    print(response.status_code)
    

    13. Timeout settings

    import requests
    
    response = requests.get('https://www.taobao.com',timeout=1)
    print(response.status_code)
    

    14. Authentication

    For sites that require a login before they can be accessed, authentication handling is needed:

    import requests
    
    response = requests.get('https://127.27.34.24:9001',auth=('user','123'))
    print(response.status_code)
    
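    An equivalent, slightly more explicit form passes an HTTPBasicAuth object from requests.auth; this sketch uses the same hypothetical local address and credentials as above:

    import requests
    from requests.auth import HTTPBasicAuth
    
    response = requests.get('https://127.27.34.24:9001',auth=HTTPBasicAuth('user','123'))
    print(response.status_code)
    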

    15. Exception handling

    import requests
    from requests.exceptions import ReadTimeout,HTTPError,RequestException
    try:
        response = requests.get('https://www.taobao.com',timeout=1)
        print(response.status_code)
    except ReadTimeout:
        print('Timeout')
    except HTTPError:
        print('Http error')
    except RequestException:
        print('error')
    

      
