zoukankan      html  css  js  c++  java
  • 基于python的爬虫(一)

    一.爬虫的基本流程:

    # 1、发起请求: 
      使用http库向目标站点发起请求,即发送一个Request
      Request包含:请求头、请求体等
    
    # 2、获取响应内容
      如果服务器能正常响应,则会得到一个Response
      Response包含:html,json,图片,视频等
    
    # 3、解析内容
      解析html数据:正则表达式,第三方解析库如Beautifulsoup,pyquery等
      解析json数据:json模块
      解析二进制数据:以b的方式写入文件
    
    # 4、保存数据
      数据库
      文件

    二.我们来爬一个校花网

    import requests
    import re
    
    # 爬虫三部曲
    
    # 一 发送请求
    def get_page(url):
        """Send a GET request to ``url`` and return the response body as text."""
        return requests.get(url).text
    
    # 二 解析数据
    # 解析主页
    def parse_index(index_page):
        """Yield the detail-page URL of every item found in a listing page.

        ``index_page`` is the listing page's HTML text. The first ``href``
        following each ``<div class="items">`` is taken as the item's link;
        links that do not start with ``http`` are prefixed with the site root.
        """
        item_link = re.compile('<div class="items">.*?href="(.*?)"', re.S)

        for match in item_link.finditer(index_page):
            link = match.group(1)
            if link.startswith('http'):
                yield link
            else:
                yield 'http://www.xiaohuar.com' + link
    
    # 解析详情页
    def parse_detail(detail_page):
        """Yield the video URL of a detail page, if it has one ending in ``.mp4``.

        ``detail_page`` is the detail page's HTML text. Only the first
        ``src`` following ``id="media"`` is considered, so the generator
        produces at most one URL.
        """
        found = re.search('id="media".*?src="(.*?)"', detail_page, re.S)
        if found is None:
            return

        video_url = found.group(1)
        if video_url.endswith('.mp4'):
            yield video_url
    
    # 三 保存数据
    import uuid
    def save_video(video_url, save_dir=r'D:\pachong\movies'):
        """Download ``video_url`` and store it in ``save_dir`` under a random name.

        Args:
            video_url: Absolute URL of the ``.mp4`` file to download.
            save_dir: Directory that receives the file; created when missing.
                The default restores the directory separators that the
                published path literal had lost.

        Returns:
            None. The call is best-effort: network and file-system failures
            are reported on stdout instead of being raised, so one bad video
            does not stop the crawl.
        """
        import os  # local import keeps this snippet self-contained

        try:
            res = requests.get(video_url)
            # Refuse to store an HTTP error page under an .mp4 name.
            res.raise_for_status()

            os.makedirs(save_dir, exist_ok=True)
            target = os.path.join(save_dir, '%s.mp4' % uuid.uuid4())
            # Leaving the with-block flushes and closes the file.
            with open(target, 'wb') as f:
                f.write(res.content)
        except (requests.RequestException, OSError) as exc:
            print('%s failed: %s' % (video_url, exc))
    
    
    if __name__ == '__main__':
        # Walk listing pages 0-4, follow every item to its detail page and
        # save each video that the detail page exposes.
        url_template = 'http://www.xiaohuar.com/list-3-{}.html'
        for page_no in range(5):
            listing_html = get_page(url_template.format(page_no))
            for detail_url in parse_index(listing_html):
                for video_url in parse_detail(get_page(detail_url)):
                    save_video(video_url)

    并发版:

    # pip3 install requests
    import requests
    import re
    from concurrent.futures import ThreadPoolExecutor
    
    # Thread pool (at most 50 worker threads) shared by the functions below,
    # which submit their download jobs to it.
    pool = ThreadPoolExecutor(50)
    
    # 爬虫三部曲
    
    # 一 发送请求
    def get_page(url):
        """Announce the download on stdout, GET ``url`` and return the body text."""
        print('%s GET start ...' % url)
        return requests.get(url).text
    
    # 二 解析数据
    # 解析主页
    def parse_index(index_page):
        """Done-callback for a listing-page download.

        ``index_page`` is the finished future of a ``get_page`` job; its
        result is the listing page's HTML. Every item link found is made
        absolute when needed and queued on ``pool`` as a new ``get_page``
        job whose completion is handled by ``parse_detail``.
        """
        html = index_page.result()
        links = re.findall('<div class="items">.*?href="(.*?)"', html, re.S)

        for link in links:
            absolute = link if link.startswith('http') else 'http://www.xiaohuar.com' + link
            detail_job = pool.submit(get_page, absolute)
            detail_job.add_done_callback(parse_detail)
    
    # 解析详情页
    def parse_detail(detail_page):
        """Done-callback for a detail-page download.

        ``detail_page`` is the finished future of a ``get_page`` job; its
        result is the detail page's HTML. When the first ``src`` following
        ``id="media"`` ends in ``.mp4``, a ``save_video`` job for that URL
        is queued on ``pool``.
        """
        html = detail_page.result()

        found = re.search('id="media".*?src="(.*?)"', html, re.S)
        if found is None:
            return

        video_url = found.group(1)
        if video_url.endswith('.mp4'):
            pool.submit(save_video, video_url)
    
    
    # 三 保存数据
    import uuid
    def save_video(video_url, save_dir=r'D:\tank\day01\movies'):
        """Download ``video_url`` and store it in ``save_dir`` under a random name.

        Args:
            video_url: Absolute URL of the ``.mp4`` file to download.
            save_dir: Directory that receives the file; created when missing.
                The default restores the directory separators of the
                published path literal, which had been mangled into a TAB
                character and run-together folder names.

        Returns:
            None. The call is best-effort: network and file-system failures
            are reported on stdout instead of being raised, so one bad video
            does not affect the other jobs in the pool.
        """
        import os  # local import keeps this snippet self-contained

        try:
            res = requests.get(video_url)
            # Refuse to store an HTTP error page under an .mp4 name.
            res.raise_for_status()

            os.makedirs(save_dir, exist_ok=True)
            target = os.path.join(save_dir, '%s.mp4' % uuid.uuid4())
            # Leaving the with-block flushes and closes the file.
            with open(target, 'wb') as f:
                f.write(res.content)
            print('%s done ...' % video_url)
        except (requests.RequestException, OSError) as exc:
            print('%s failed: %s' % (video_url, exc))
    
    
    if __name__ == '__main__':
        # Queue listing pages 0-4; the done-callbacks chain the remaining
        # stages (detail pages, then video downloads) onto the same pool.
        url_template = 'http://www.xiaohuar.com/list-3-{}.html'
        for page_no in range(5):
            listing_job = pool.submit(get_page, url_template.format(page_no))
            listing_job.add_done_callback(parse_index)

    三.requests的基本使用

          1.get请求的两种方式:

    import requests
    from urllib.parse import urlencode
    
    # Request URL: the query string is percent-encoded by hand with urlencode.
    base_url = 'https://www.baidu.com/s?' + urlencode({"wd": "美女"})
    # Request headers: send a browser-like User-Agent.
    headers = {
        'User-Agent':'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36 SE 2.X MetaSr 1.0'
    }

    # Request method: GET
    res = requests.get(base_url, headers=headers)
    # print(res)            -> the Response object
    # print(res.text)       -> the whole HTML document as text
    # print(res.content)    -> the raw body as bytes
    # The "with" statement below had been fused into the comment line above
    # it, which left the indented write without an enclosing block.
    with open('meinv.html', 'w', encoding='utf-8') as f:
        f.write(res.text)

           每次url编码会很麻烦,所以可以在GET内添加参数即可:

    import requests
    
    # Request URL: only the endpoint; the query string is supplied through
    # ``params`` in the call below.
    base_url = 'https://www.baidu.com/s?'
    
    # Request headers: send a browser-like User-Agent.
    headers = {
        'User-Agent':'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36 SE 2.X MetaSr 1.0'
    }
    #
    # Request method: GET. The ``wd`` search term is passed as ``params``
    # instead of being urlencoded into the URL by hand.
    res = requests.get(base_url, headers=headers, params={"wd": "黄云"})
    
    # Save the response text as a UTF-8 file.
    with open('小云云.html', 'w', encoding='utf-8') as f:
        f.write(res.text)

           get请求访问知乎:

    # Visit Zhihu
    
    # Request URL
    zhi_url = 'https://www.zhihu.com/explore'
    
    # Request headers: send a browser-like User-Agent.
    headers = {
        'User-Agent':'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36 SE 2.X MetaSr 1.0'
    
    }
    # Request method: GET
    res = requests.get(zhi_url, headers=headers)
    # Save the response text as a UTF-8 file.
    with open('知乎.html', 'w', encoding='utf-8') as f:
        f.write(res.text)

        get请求访问github:

    # Request headers for pages that require a logged-in GitHub session.
    #
    # A session cookie is a credential, so it must not be pasted into source
    # code. Copy the Cookie header from a logged-in browser session into the
    # GITHUB_COOKIE environment variable before running this snippet (and,
    # optionally, a second capture into GITHUB_COOKIE_2). When the variable
    # is unset the Cookie header is empty and GitHub answers as logged out.
    import os

    url='https://github.com/settings/emails'
    github_cookie = os.environ.get('GITHUB_COOKIE', '')

    # Headers captured on the post-login home page.
    headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36',
    'Cookie': github_cookie

    }
    # Headers captured on the email settings page; falls back to the same cookie.
    headers_2 = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36',
    'Cookie': os.environ.get('GITHUB_COOKIE_2', github_cookie)
    }
    # Request method: GET. Either header dict can be used; the commented-out
    # call is the variant that sends ``headers_2``.
    # res = requests.get(url, headers=headers_2)
    res = requests.get(url, headers=headers)
    # Save the returned page as a UTF-8 file.
    with open('github.html', 'w', encoding='utf-8') as f:
        f.write(res.text)
    # Check whether this string occurs in the page.
    # NOTE(review): presumably part of the account's email address, used as
    # proof that the page was served for the logged-in user - confirm.
    print('1059239165' in res.text)
    
    # Output recorded by the author: True

       2.post请求

    # Step 1: GET https://github.com/login to obtain the form token.
    headers = {
        'User-Agent':'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36 SE 2.X MetaSr 1.0'
    
    }
    
    login_res = requests.get('https://github.com/login', headers=headers)
    #
    
    # Take the value attribute that follows name="authenticity_token" in the
    # login page; the [0] raises IndexError when the page has no such field.
    authenticity_token = re.findall('name="authenticity_token".*?value="(.*?)"', login_res.text, re.S)[0]
    print(
        authenticity_token
    )
    # Step 2: collect the cookies.
    cookies = {}
    
    
    # Copy the cookies set by the login page into the ``cookies`` dict.
    cookies.update(login_res.cookies.get_dict())
    print(cookies)
    
    
    # Step 3: send the login form.
    # Request method: POST
    # Request URL:    https://github.com/session

    import os

    # Request body. The account name and password are credentials, so they
    # are read from the GITHUB_LOGIN / GITHUB_PASSWORD environment variables
    # instead of being written into the source.
    # NOTE(review): the leading space in " supported" looks accidental -
    # confirm against the value the real login form submits.
    form_data = {
        "commit": "Sign in",
        "utf8": "",
        "authenticity_token": authenticity_token,
        "login": os.environ.get('GITHUB_LOGIN', ''),
        "password": os.environ.get('GITHUB_PASSWORD', ''),
        "webauthn-support":" supported"

    }

    # Sending the body as JSON instead would be:
    # requests.post('https://github.com/session', headers=headers, json=form_data)
    res = requests.post('https://github.com/session', headers=headers, data=form_data, cookies=cookies)

    # print(res.status_code)

    # Save the page returned by the login request.
    with open('github.html', 'w', encoding='utf-8') as f:
        f.write(res.text)

     3.response

    import requests
    
    # Fetch one page, then print the attributes of the Response one by one.
    baidu = 'https://www.baidu.com/'
    
    headers = {
        'User-Agent':'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36 SE 2.X MetaSr 1.0'
    
    }
    
    res = requests.get(baidu, headers=headers)
    
    # Response status code
    print(res.status_code)
    
    # The Response object itself
    print(res)
    # Response headers
    print(res.headers)
    
    # Response body as text
    print(res.text)
    
    # URL of the response
    print(res.url)
    # Cookies of the response, as a cookie jar and as a plain dict
    print(res.cookies)
    print(res.cookies.get_dict())
    
    # Encoding used to decode ``res.text``; it can be overridden by assignment
    print(res.encoding)
    # res.encoding = 'utf-8'
    # print(res.encoding)
    
    # Redirect history of the request
    print(res.history)
    
    # Response body as bytes
    print(res.content)

    下载一张图片:

    bo = 'https://timgsa.baidu.com/timg?image&quality=80&size=b9999_10000&sec=1551942493340&di=afa19a1f5a3a4fbdec983baaeb1954f0&imgtype=0&src=http%3A%2F%2Fwww.xnnews.com.cn%2Fwenyu%2Flxsj%2F201611%2FW020161114828261827516.jpg'
    
    headers = {
        'User-Agent':'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36 SE 2.X MetaSr 1.0'
    
    }
    
    # stream=True defers the body download so it can be written piece by piece.
    res = requests.get(bo, headers=headers, stream=True)
    with open('bo2.jpg', 'wb') as f:
        # iter_content() yields 1-byte chunks by default; request 8 KiB
        # chunks so the image is not written one byte at a time.
        for chunk in res.iter_content(chunk_size=8192):
            f.write(chunk)

    补充:

      取消重定向(默认为True):
      allow_redirects=False

     4.session用法:

    import requests
    import re
    
    # One session object is used for both requests below.
    session = requests.session()
    
    
    # Step 1: GET the login page to obtain the form token.
    # The string below is a sample of the markup the regex matches.
    '''
    name="authenticity_token" value="/pE5/yY3Ibm1z0CgiSrqZheBOGQl+rPLs491/TOUL0sRIaQFQzS/s/er5eC/xxEO2AGY0l39b0rEStW/A6Bngg=="
    '''
    
    headers = {
        'User-Agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Mobile Safari/537.36',
        # 'Cookies'
    }
    
    login_res = session.get('https://github.com/login', headers=headers)
    
    # Take the value attribute that follows name="authenticity_token";
    # the [0] raises IndexError when the page has no such field.
    authenticity_token = re.findall('name="authenticity_token".*?value="(.*?)"', login_res.text, re.S)[0]
    
    
    
    # Step 2: send the login form through the same session.
    # Request method: POST
    # Request URL:    https://github.com/session

    import os

    # Request body. The account name and password are credentials, so they
    # are read from the GITHUB_LOGIN / GITHUB_PASSWORD environment variables
    # instead of being written into the source.
    form_data = {
        "commit": "Sign in",
        "utf8": "",
        "authenticity_token": authenticity_token,
        "login": os.environ.get('GITHUB_LOGIN', ''),
        "password": os.environ.get('GITHUB_PASSWORD', ''),
    }

    # No cookies argument is passed here: the request goes through ``session``.
    res = session.post('https://github.com/session', headers=headers, data=form_data)


    print(res.status_code)
    # print(res.text)

    # Save the page returned by the login request.
    with open('github.html', 'w', encoding='utf-8') as f:
        f.write(res.text)

    5.json格式反序列化:

    import requests
    import json
    res = requests.get('https://www.toutiao.com/stream/widget/local_weather/city/')
    # Raw response text
    print(res.text)
    # Two ways to deserialize the JSON body: json.loads on the text, or
    # the response's own json() method.
    print(json.loads(res.text))
    print(res.json())

     补充:

    '''
    requests高级用法
    了解!
    '''
    
    import requests
    # SSL: plain HTTPS request with default certificate handling.
    res = requests.get('https://www.xiaohuar.com/')
    print(res.text)
    
    # Variant 1: skip certificate verification (verify=False).
    res = requests.get('https://www.xiaohuar.com/', verify=False)
    print(res.text)
    
    
    # Variant 2: additionally silence the warnings via urllib3.
    import urllib3
    urllib3.disable_warnings()
    res = requests.get('https://www.xiaohuar.com/', verify=False)
    print(res.text)
    
    # Variant 3: supply a client certificate as a (cert file, key file) pair.
    # The two paths are placeholders; verification stays disabled here too.
    import urllib3
    urllib3.disable_warnings()
    res = requests.get('https://www.xiaohuar.com/', verify=False,
                       cert=('/path/server.crt', '/path/key'))
    print(res.text)
    
    
    # Using a proxy: ``proxies`` maps a scheme name to a proxy URL.
    # The commented-out dict is an HTTP proxy example; the active one is a
    # placeholder ('ip:port' must be replaced before it can work).
    # NOTE(review): 'sock' / 'sock://' does not look like a scheme requests
    # recognises - confirm the intended key and URL scheme.
    res = requests.get('https://www.baidu.com/', headers={
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36'
    },
                       # proxies={
                       #     'http': 'http://112.85.130.66:9999',
                       #     # 'https': 'https://112.85.130.66:9999',
                       # }
                       proxies={
                           'sock': 'sock://ip:port'
                       })
    
    print(res.text)
    
    
    # Timeout setting: the request raises an error when the timeout is
    # exceeded. The value here is tiny on purpose, to demonstrate that error.
    import requests
    respone=requests.get('https://www.baidu.com',
                         timeout=0.0001)
    print(respone.text)
    
    # Authentication: send HTTP Basic credentials with the request.
    # 'xxx', 'user' and 'password' are placeholders to be replaced.
    import requests
    from requests.auth import HTTPBasicAuth
    r=requests.get('xxx', auth=HTTPBasicAuth('user','password'))
    print(r.status_code)
    
    # 上传文件
    
    import requests
    # Open the upload inside a with-block so the file handle is closed once
    # the request has been sent (the original handle was never closed).
    with open('a.jpg', 'rb') as upload:
        files = {'file': upload}
        response = requests.post('http://httpbin.org/post', files=files)
    print(response.status_code)
  • 相关阅读:
    xshell的安装及连接linux的使用方法
    linux中yum install 命令无效
    linux-centOS环境下安装jdk8
    centOS不显示ipv4地址的解决办法
    centOS开启和关闭防火墙
    java-分布式-索引
    java-网络通信-索引
    java-中间件
    java-框架-索引
    JVM-索引
  • 原文地址:https://www.cnblogs.com/sima-3/p/11074946.html
Copyright © 2011-2022 走看看