zoukankan      html  css  js  c++  java
  • python-爬虫

    1.爬虫的定义:   向网站发起请求,获取资源后分析并提取有用数据的程序

    2.爬虫的基本流程:

    #1、发起请求
    使用http库向目标站点发起请求,即发送一个Request
    Request包含:请求头、请求体等
    
    #2、获取响应内容
    如果服务器能正常响应,则会得到一个Response
    Response包含:html,json,图片,视频等
    
    #3、解析内容
    解析html数据:正则表达式,第三方解析库如Beautifulsoup,pyquery等
    解析json数据:json模块
    解析二进制数据:以b的方式写入文件
    
    #4、保存数据
    数据库
    文件

    3.格式:

        requests.get/post(                     #requests库发送请求将网页内容下载下来以后,并不会执行js代码,这需要我们自己分析目标站点然后发起新的request请求
                url,
                params={} ,    #请求数据
                cookies={},
    
                headers={
                    User-agent:
                    (cookie):
                    Referer:
                    },
                data={
                    如果是get方式,请求体没有内容
                    如果是post方式,请求体是format data
                    },
                allow_redirects=False,   #默认是True
                )

    4.简单示例:

    import requests #pip3 install requests
    import re
    import hashlib
    import time
    
    # FIX(review): original read r'C:mp4', a drive-relative path on Windows;
    # the intended download directory is C:\mp4.
    movie_path=r'C:\mp4'
    
    def get_page(url):
        """GET *url* and return the response body text, or None on any failure."""
        try:
            # A timeout keeps the crawler from hanging forever on a dead host.
            response = requests.get(url, timeout=10)
            if response.status_code == 200:
                return response.text
        except requests.RequestException:
            # Best-effort crawl: a page that cannot be fetched is simply skipped.
            pass
        return None
    
    def parse_index(index_page):
        """Yield absolute detail-page URLs scraped from a listing page."""
        pattern = 'class="items".*?href="(.*?)"'
        for href in re.findall(pattern, index_page, re.S):
            if href.startswith('http'):
                yield href
            else:
                # relative link -- prefix the site root
                yield 'http://www.xiaohuar.com' + href
    
    def parse_detail(detail_page):
        """Yield the movie URL embedded in a detail page, if it is an mp4."""
        matches = re.findall('id="media".*?src="(.*?)"', detail_page, re.S)
        if not matches:
            return
        candidate = matches[0]
        if candidate.endswith('mp4'):
            yield candidate
    
    def get_movie(url):
        """Download the video at *url* into movie_path under a hash-based name."""
        try:
            response = requests.get(url)
            if response.status_code == 200:
                # Hash time.time() plus the url so every download gets a
                # unique, filesystem-safe file name.
                m = hashlib.md5()
                m.update(str(time.time()).encode('utf-8'))
                m.update(url.encode('utf-8'))
                # FIX: '\%' was an invalid escape sequence (DeprecationWarning);
                # '\\' yields the same backslash separator explicitly.
                filepath = '%s\\%s.mp4' % (movie_path, m.hexdigest())
                with open(filepath, 'wb') as f:
                    f.write(response.content)
                    print('%s 下载成功' % url)
        except Exception:
            # Best-effort crawl: a failed download is skipped, not fatal.
            pass
    
    def main():
        """Crawl the first five listing pages and download every mp4 found."""
        base_url = 'http://www.xiaohuar.com/list-3-{page_num}.html'
        for page in range(5):
            listing = get_page(base_url.format(page_num=page))
            for detail_url in parse_index(listing):
                detail_page = get_page(detail_url)
                for movie_url in parse_detail(detail_page):
                    get_movie(movie_url)
    
    if __name__ == '__main__':
        main()
    爬取校花网视频(不能并发)
    import requests #pip3 install requests
    import re
    import hashlib
    import time
    from concurrent.futures import ThreadPoolExecutor
    
    # 50 worker threads drive the whole fetch/parse/download pipeline.
    pool=ThreadPoolExecutor(50)
    # FIX(review): original read r'C:mp4', a drive-relative path on Windows;
    # the intended download directory is C:\mp4.
    movie_path=r'C:\mp4'
    
    def get_page(url):
        """GET *url* and return the response body text, or None on any failure."""
        try:
            # A timeout keeps pool workers from hanging forever on a dead host.
            response = requests.get(url, timeout=10)
            if response.status_code == 200:
                return response.text
        except requests.RequestException:
            # Best-effort crawl: a page that cannot be fetched is simply skipped.
            pass
        return None
    
    def parse_index(index_page):
        """Future callback for a listing page: submit every detail page for fetching."""
        html = index_page.result()
        for href in re.findall('class="items".*?href="(.*?)"', html, re.S):
            if not href.startswith('http'):
                href = 'http://www.xiaohuar.com' + href
            # Chain the next pipeline stage as a callback so the pool stays busy.
            pool.submit(get_page, href).add_done_callback(parse_detail)
    
    def parse_detail(detail_page):
        """Future callback for a detail page: queue the mp4 download, if any."""
        html = detail_page.result()
        found = re.findall('id="media".*?src="(.*?)"', html, re.S)
        if found and found[0].endswith('mp4'):
            pool.submit(get_movie, found[0])
    
    def get_movie(url):
        """Download the video at *url* into movie_path under a hash-based name."""
        try:
            response = requests.get(url)
            if response.status_code == 200:
                # Hash time.time() plus the url so every download gets a
                # unique, filesystem-safe file name.
                m = hashlib.md5()
                m.update(str(time.time()).encode('utf-8'))
                m.update(url.encode('utf-8'))
                # FIX: '\%' was an invalid escape sequence (DeprecationWarning);
                # '\\' yields the same backslash separator explicitly.
                filepath = '%s\\%s.mp4' % (movie_path, m.hexdigest())
                with open(filepath, 'wb') as f:
                    f.write(response.content)
                    print('%s 下载成功' % url)
        except Exception:
            # Best-effort crawl: a failed download is skipped, not fatal.
            pass
    
    def main():
        """Submit the first five listing pages; chained callbacks do the rest."""
        base_url = 'http://www.xiaohuar.com/list-3-{page_num}.html'
        for page in range(5):
            future = pool.submit(get_page, base_url.format(page_num=page))
            future.add_done_callback(parse_index)
    
    if __name__ == '__main__':
        main()
    爬取校花网视频(并发)

    5. requests模块的详细用法

    import requests
    from urllib.parse import urlencode

    5.1 请求头中要有 'User-Agent'
    # NOTE(review): this whole section was collapsed onto a single unparseable
    # line in the source; reformatted into runnable snippets.
    keyword = input('>>: ').strip()
    res = urlencode({'wd': keyword}, encoding='utf-8')
    url = 'https://www.baidu.com/s?' + res
    print(url)
    respone = requests.get(
        url,
        headers={
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36',
        },
    )
    print(respone.status_code)
    with open('a.html', 'w', encoding='utf-8') as f:
        f.write(respone.text)

    # 5.2 Query-string data can go in params= instead of being baked into the URL
    keyword = input('>>: ').strip()
    respone = requests.get(
        'https://www.baidu.com/s?',
        params={
            'wd': keyword,
            'pn': 20
        },
        headers={
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36',
        },
    )
    print(respone.status_code)
    with open('a.html', 'w', encoding='utf-8') as f:
        f.write(respone.text)

    response = requests.get(
        'https://www.zhihu.com/explore',
        headers={
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36',
        })
    print(response.status_code)
    print(response.text)

    # 5.3 Cookies may be sent inside the request headers
    response = requests.get(
        url='https://github.com/settings/emails',
        headers={
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36',
            "Cookie": "_octo=GH1.1.9390043.1516008745; _ga=GA1.2.674621867.1516008745; _gat=1; tz=Asia%2FShanghai; user_session=gPR8zXuoKMY-h9R6WVlju1xa-jBcNRpEoWNOa9k3B922hlqy; __Host-user_session_same_site=gPR8zXuoKMY-h9R6WVlju1xa-jBcNRpEoWNOa9k3B922hlqy; logged_in=yes; dotcom_user=egonLin; _gh_sess=eyJsYXN0X3dyaXRlIjoxNTE2MDA4Nzc1NTkyLCJmbGFzaCI6eyJkaXNjYXJkIjpbXSwiZmxhc2hlcyI6eyJhbmFseXRpY3NfZGltZW5zaW9uIjp7Im5hbWUiOiJkaW1lbnNpb241IiwidmFsdWUiOiJMb2dnZWQgSW4ifX19LCJzZXNzaW9uX2lkIjoiMzllOGI4NjI4ODdjMTFlMmEyYTg5ZDUyMmU0NzQ4ODEifQ%3D%3D--37b89b9cb319fba0e2f7df68bcffe1a56bea7a41",
        }
    )
    print('378533872@qq.com' in response.text)

    # 5.4 Cookies may also be passed separately via the cookies= argument
    response = requests.get(
        url='https://github.com/settings/emails',
        headers={
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36',
        },
        cookies={
            "k1": "v1",
        },
    )
    print('378533872@qq.com' in response.text)

    # 5.5 allow_redirects decides whether the requested page may redirect
    response = requests.get(
        url='https://github.com/settings/emails',
        headers={
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36',
        },
        cookies={
            "k1": "v1",
        },
        allow_redirects=False,
    )

    6.小练习

    import requests
    import re
    
    # Step 1: fetch the login page to obtain authenticity_token and cookies:
    # 1 request url: https://github.com/login
    # 2 request method: GET
    # 3 request headers:
    #    User-Agent
    r1 = requests.get(
        'https://github.com/login',
        headers={
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36',
        },
    )
    authenticity_token = re.findall('name="authenticity_token".*?value="(.*?)"', r1.text, re.S)[0]
    r1_cookies = r1.cookies.get_dict()
    print(authenticity_token)
    print(r1_cookies)
    
    # Step 2: submit the form data to complete the login
    # 1 request url: https://github.com/session
    # 2 request method: POST
    # 3 request headers:
    #    Referer: https://github.com/
    #    User-Agent
    # 4 request body: commit / utf8 / authenticity_token (from step 1) /
    #    login / password
    r2 = requests.post(
        'https://github.com/session',
        headers={
            "Referer": "https://github.com/",
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36',
        },
        cookies=r1_cookies,
        data={
            "commit": "Sign in",
            'utf8': "",
            "authenticity_token": authenticity_token,
            "login": "32222222@qq.com",
            "password": "66666",
        },
        allow_redirects=False,
    )
    
    # print(r2.status_code)
    # print(r2.history)      # pre-redirect responses, when redirects are allowed
    
    cookies = r2.cookies.get_dict()
    
    # Step 3: use the post-login cookies to read a members-only page.
    r3 = requests.get(
        'https://github.com/settings/emails',
        headers={
            "Referer": "https://github.com/",
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36',
        },
        cookies=cookies,
    )
    print('32222222@qq.com' in r3.text)
    模拟登录git练习

     7.响应Response

     7.1、response属性

    复制代码
    import requests
    respone=requests.get('http://www.jianshu.com')
    # Response attributes
    print(respone.text)
    print(respone.content)
    
    print(respone.status_code)
    print(respone.headers)
    print(respone.cookies)
    print(respone.cookies.get_dict())
    print(respone.cookies.items())
    
    print(respone.url)
    print(respone.history)
    
    print(respone.encoding)
    
    # Explicit close: response.close(); or let closing() handle it:
    from contextlib import closing
    with closing(requests.get('xxx',stream=True)) as response:
        for line in response.iter_content():
            pass  # FIX: the loop body must be indented (original raised IndentationError)
    复制代码

     7.2、编码问题

    #Encoding issue
    import requests
    response=requests.get('http://www.autohome.com/news')
    # response.encoding='gbk' #Autohome pages are gb2312-encoded while requests defaults to ISO-8859-1; without setting gbk the Chinese text comes out garbled
    print(response.text)
    View Code

     7.3、获取二进制数据

    复制代码
    import requests
    
    # Fetch a binary resource (an image); response.content holds the raw bytes.
    response=requests.get('https://timgsa.baidu.com/timg?image&quality=80&size=b9999_10000&sec=1509868306530&di=712e4ef3ab258b36e9f4b48e85a81c9d&imgtype=0&src=http%3A%2F%2Fc.hiphotos.baidu.com%2Fimage%2Fpic%2Fitem%2F11385343fbf2b211e1fb58a1c08065380dd78e0c.jpg')
    
    # Write the bytes out in binary mode.
    with open('a.jpg','wb') as f:
        f.write(response.content)
    复制代码
    #stream参数:一点一点的取,比如下载视频时,如果视频100G,用response.content然后一下子写到文件中是不合理的
    
    import requests
    
    # stream=True fetches the body lazily, so a huge file is never held in
    # memory all at once.
    response=requests.get('https://gss3.baidu.com/6LZ0ej3k1Qd3ote6lo7D0j9wehsv/tieba-smallvideo-transcode/1767502_56ec685f9c7ec542eeaf6eac93a65dc7_6fe25cd1347c_3.mp4',
                          stream=True)
    
    with open('b.mp4','wb') as f:
        # FIX: iter_content() defaults to chunk_size=1 (one byte per
        # iteration), which is pathologically slow; 1 MiB chunks write the
        # identical bytes far faster.
        for chunk in response.iter_content(chunk_size=1024 * 1024):
            f.write(chunk)
    获取二进制流

    7.4、解析json

    # Parsing a JSON response
    import json

    import requests

    response = requests.get('http://httpbin.org/get')

    # The long way round: deserialize response.text yourself.
    res1 = json.loads(response.text)
    # The shortcut: requests deserializes for you.
    res2 = response.json()

    print(res1 == res2)  # True
    View Code

    7.5、Redirection and History

    By default Requests will perform location redirection for all verbs except HEAD.
    
    We can use the history property of the Response object to track redirection.
    
    The Response.history list contains the Response objects that were created in order to complete the request. The list is sorted from the oldest to the most recent response.
    
    For example, GitHub redirects all HTTP requests to HTTPS:
    
    >>> r = requests.get('http://github.com')
    
    >>> r.url
    'https://github.com/'
    
    >>> r.status_code
    200
    
    >>> r.history
    [<Response [301]>]
    If you're using GET, OPTIONS, POST, PUT, PATCH or DELETE, you can disable redirection handling with the allow_redirects parameter:
    
    >>> r = requests.get('http://github.com', allow_redirects=False)
    
    >>> r.status_code
    301
    
    >>> r.history
    []
    If you're using HEAD, you can enable redirection as well:
    
    >>> r = requests.head('http://github.com', allow_redirects=True)
    
    >>> r.url
    'https://github.com/'
    
    >>> r.history
    [<Response [301]>]
    先看官网的解释
    import requests
    import re
    
    #First request
    r1=requests.get('https://github.com/login')
    r1_cookie=r1.cookies.get_dict() #grab the initial (not yet authorized) cookies
    authenticity_token=re.findall(r'name="authenticity_token".*?value="(.*?)"',r1.text)[0] #pull the CSRF token out of the page
    
    #Second request: POST to the login page with the initial cookies, the token, and the account credentials
    data={
        'commit':'Sign in',
        'utf8':'',
        'authenticity_token':authenticity_token,
        'login':'317828332@qq.com',
        'password':'alex3714'
    }
    
    
    
    
    
    
    #Test 1: without allow_redirects=False, a Location response header triggers a jump to the new page; r2 is the new page's response
    r2=requests.post('https://github.com/session',
                 data=data,
                 cookies=r1_cookie
                 )
    
    print(r2.status_code) #200
    print(r2.url) #the post-redirect page
    print(r2.history) #the pre-redirect response
    print(r2.history[0].text) #the pre-redirect response.text
    
    
    #Test 2: with allow_redirects=False, even a Location header causes no jump; r2 is still the old page's response
    r2=requests.post('https://github.com/session',
                 data=data,
                 cookies=r1_cookie,
                 allow_redirects=False
                 )
    
    
    print(r2.status_code) #302
    print(r2.url) #the pre-redirect page https://github.com/session
    print(r2.history) #[]
    利用github登录后跳转到主页面的例子来验证它

    8.高级用法:

    使用代理:

    #Official docs: http://docs.python-requests.org/en/master/user/advanced/#proxies
    
    #Proxy setup: the request goes to the proxy first, which forwards it on
    #(IP bans are common, so proxies are routine).
    import requests
    proxies={
        # FIX(review): a dict cannot hold two 'http' keys -- the credentialed
        # entry below was silently shadowed by the plain one; keep one form only.
        # 'http':'http://egon:123@localhost:9743',  # proxy with credentials: user:pass before the @
        'http':'http://localhost:9743',
        'https':'https://localhost:9743',
    }
    respone=requests.get('https://www.12306.cn',
                         proxies=proxies)
    
    print(respone.status_code)
    
    
    
    #SOCKS proxies are supported too; install the extra with: pip install requests[socks]
    import requests
    proxies = {
        'http': 'socks5://user:pass@host:port',
        'https': 'socks5://user:pass@host:port'
    }
    respone=requests.get('https://www.12306.cn',
                         proxies=proxies)
    
    print(respone.status_code)
    View Code

     9.练习

    import requests
    import re
    
    session = requests.session()
    
    # Step 1: GET the login page to obtain X_Anti_Forge_Token and X_Anti_Forge_Code
    # 1. request url: https://passport.lagou.com/login/login.html
    # 2. request method: GET
    # 3. request headers:
    #    User-agent
    r1 = session.get('https://passport.lagou.com/login/login.html',
                     headers={
                         'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36',
                     },
                     )
    
    # The anti-forgery token/code pair is embedded in the page's script text.
    X_Anti_Forge_Token = re.findall("X_Anti_Forge_Token = '(.*?)'", r1.text, re.S)[0]
    X_Anti_Forge_Code = re.findall("X_Anti_Forge_Code = '(.*?)'", r1.text, re.S)[0]
    # print(X_Anti_Forge_Token,X_Anti_Forge_Code)
    
    # Step 2: log in
    # 1. request url: https://passport.lagou.com/login/login.json
    # 2. request method: POST
    # 3. request headers:
    #    cookie
    #    User-agent
    #    Referer:https://passport.lagou.com/login/login.html
    #    X-Anit-Forge-Code:53165984
    #    X-Anit-Forge-Token:3b6a2f62-80f0-428b-8efb-ef72fc100d78
    #    X-Requested-With:XMLHttpRequest
    # 4. request body:
    # isValidate:true
    # username:18611453110
    # password:70621c64832c4d4d66a47be6150b4a8e
    # request_form_verifyCode:''
    # submit:''
    r2 = session.post('https://passport.lagou.com/login/login.json',
                      headers={
                          'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36',
                          'Referer': 'https://passport.lagou.com/login/login.html',
                          'X-Anit-Forge-Code': X_Anti_Forge_Code,
                          'X-Anit-Forge-Token': X_Anti_Forge_Token,
                          'X-Requested-With': 'XMLHttpRequest'
                      },
                      data={
                          "isValidate": True,
                          'username': '18611453110',
                          'password': '70621c64832c4d4d66a47be6150b4a8e',
                          'request_form_verifyCode': '',
                          'submit': ''
                      }
                      )
    # Step 3: grant the service ticket (authorization)
    # 1. request url: https://passport.lagou.com/grantServiceTicket/grant.html
    # 2. request method: GET
    # 3. request headers:
    #    User-agent
    #    Referer:https://passport.lagou.com/login/login.html
    
    r3 = session.get('https://passport.lagou.com/grantServiceTicket/grant.html',
                     headers={
                         'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36',
                         'Referer': 'https://passport.lagou.com/login/login.html',
                     }
                     )
    
    # Step 4: verify the login by fetching a page that requires authentication
    r4 = session.get('https://www.lagou.com/resume/myresume.html',
                     headers={
                         'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36',
                     }
                     )
    
    # print('18611453110' in r4.text)
    
    
    # 第五步:筛选职位信息
    # 请求url:https://www.lagou.com/jobs/list_java%E9%AB%98%E7%BA%A7%E5%BC%80%E5%8F%91
    # 请求方法:GET
    # 请求头:
    # User-Agent
    # 请求参数:
    # gj:3年及以下
    # px:default
    # yx:25k-50k
    # city:北京
    from urllib.parse import urlencode
    
    # Percent-encode the search keyword and append it to the listing URL;
    # urlencode yields 'k=<quoted>', so keep only the part after '='.
    _query = urlencode({'k': 'java高级开发'}, encoding='utf-8')
    res = _query.split('=')[-1]
    url = 'https://www.lagou.com/jobs/list_' + res
    #
    # r5 = session.get(url,
    #                  headers={
    #                      'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36',
    #                  },
    #                  params={
    #                      'gj': '3年及以下',
    #                      'px': 'default',
    #                      'yx': '25k-50k',
    #                      'city': '北京'
    #                  }
    #                  )
    #
    # print(r5.text)
    
    # request url: https://www.lagou.com/jobs/positionAjax.json
    # request method: POST
    # request headers:
    #    Referer
    #    User-Agent
    # request body:
    # first:true
    # pn:1
    # kd:java高级开发
    # query-string params:
    # params={
    #      'gj': '3年及以下',
    #      'px': 'default',
    #      'yx': '25k-50k',
    #      'city': '北京',
    #     'needAddtionalResult':False,
    #     'isSchoolJob':0
    # }
    r6 = session.post('https://www.lagou.com/jobs/positionAjax.json',
                      headers={
                          'Referer': url,
                          'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36',
    
                      },
                      data={
                          'first': True,
                          'pn': 2,
                          'kd': 'java高级开发'
                      },
                      params={
                          'gj': '3年及以下',
                          'px': 'default',
                          'yx': '25k-50k',
                          'city': '北京',
                          'needAddtionalResult': False,
                          'isSchoolJob': 0
                      }
                      )
    
    from pprint import pprint
    
    # print(r6.json())
    # The JSON payload nests the job list under content.positionResult.result.
    comapines_list = r6.json()['content']['positionResult']['result']
    for comapiny in comapines_list:
        positionId = comapiny['positionId']
        company_link = 'https://www.lagou.com/jobs/{pos_id}.html'.format(pos_id=positionId)
        companyShortName = comapiny['companyShortName']
        positionName = comapiny['positionName']
        salary = comapiny['salary']
        print('''
        详情连接:%s
        公司名:%s
        职位名:%s
        薪资:%s
        ''' % (company_link, companyShortName, positionName, salary))
    
        # Step 7: GET the detail page to refresh X_Anti_Forge_Token / X_Anti_Forge_Code
        # request url: the detail-page address
        # request method: GET
        # request headers: User-Agent
        r7 = session.get(company_link,
                         headers={
                             'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36',
                         }
                         )
        X_Anti_Forge_Token = re.findall("X_Anti_Forge_Token = '(.*?)'", r7.text, re.S)[0]
        X_Anti_Forge_Code = re.findall("X_Anti_Forge_Code = '(.*?)'", r7.text, re.S)[0]
        # print(X_Anti_Forge_Token,X_Anti_Forge_Code)
    
        # Step 8: POST the resume delivery
        # request url: https://www.lagou.com/mycenterDelay/deliverResumeBeforce.json
        # request method: POST
        # request headers: Referer (detail page), User-agent,
        #   X-Anit-Forge-Code, X-Anit-Forge-Token, X-Requested-With:XMLHttpRequest
        # request body: positionId (job ID), type:1, force:true
        # FIX(review): the original URL contained a stray space after the host
        # ('https://www.lagou.com/ mycenterDelay/...') and could never reach
        # the endpoint; removed.
        session.post('https://www.lagou.com/mycenterDelay/deliverResumeBeforce.json',
                     headers={
                         'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36',
                         'Referer': company_link,
                         'X-Anit-Forge-Code': X_Anti_Forge_Code,
                         'X-Anit-Forge-Token': X_Anti_Forge_Token,
                         'X-Requested-With': 'XMLHttpRequest'
                     },
                     data={
                         'positionId': positionId,
                         'type': 1,
                         'force': True
                     })
        # FIX(review): report success for each delivery; the original print sat
        # outside the loop and announced only the last company, once.
        print('%s 投递成功' % (companyShortName))
    爬取拉勾网自动投递简历
  • 相关阅读:
    内联函数与宏定义
    三色塔汉诺塔 三色
    Volatile C
    阶乘 简单递归实现
    双色汉诺塔 算法 (递归)
    向上向下排序
    Convert、Parse、TryParse、(int)等区别
    ToString()、Convert.ToString()、(string)、as string 的区别[转]
    ASP.NET页面刷新方法大集合
    getElementByID,getElementsByName,getElementsByTagName的区别
  • 原文地址:https://www.cnblogs.com/liuwei0824/p/8297332.html
Copyright © 2011-2022 走看看