zoukankan      html  css  js  c++  java
  • python 爬虫一

    爬虫分类

    通用      聚焦        增量式

    2.什么是UA检测,如何破解?

    服务器会读取请求头中的User-Agent(UA,即请求载体的身份标识),并根据UA的值判断请求是否来自正常的浏览器。破解方法:在请求中携带伪装成浏览器的headers(UA伪装)。

    3.简述https的加密流程

    4.什么是动态加载数据?如何爬取动态加载的数据?

    有的网站 部分数据使用ajax生成动态数据,所见非所得,使用抓包工具进行分析,获取参数发送请求,得到数据.

    5.requests模块中的get和post方法的常用参数及其作用

    url  data headers  proxies(代理) 

    加密方式

    对称加密: 客户端和服务器端交互的时候,客户端制定加密规则,把解密规则(密钥)和密文一起发给服务器端,服务器就可以解密。坏处:一旦被第三方拦截,对方同样可以解密。

    非对称加密: 服务端创建加密/解密(公钥/私钥)方式,把公钥发给客户端,客户端使用公钥加密,再把密文发给服务器,服务器用私钥解密。缺点是效率低。

    https证书机制: 客户端和服务器端找一个双方都信任的第三方认证机构。服务器端在把公钥发给客户端之前,先找第三方认证机构进行签名,得到一个证书,再把证书和公钥一并发给客户端。客户端会检查这个公钥是不是由第三方认证机构签发的,如果是,就可以使用这个公钥进行加密。

    requests模块使用流程:

    1.指定url

    2.发送请求

    3.获取响应数据

    4.持久化存储

    爬取搜狗页面数据

    # Basic requests workflow: pick a URL, send the request, read the body, persist it.
    # The import lives in this snippet so that it runs on its own.
    import requests

    # 1. Target URL.
    url = 'https://www.sogou.com/'
    # 2. Send a GET request.
    response = requests.get(url=url)
    # 3. response.text is the response body decoded to a str.
    page_text = response.text
    # 4. Persist the page to disk.
    with open('./sogou.html', 'w', encoding='utf-8') as fp:
        fp.write(page_text)
    print('over!')

    反反爬机制

    import requests

    # Sogou web search endpoint; the keyword is sent as the `query` parameter.
    url = 'https://www.sogou.com/web'
    # UA spoofing: present the request as coming from a desktop Chrome browser.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36',
    }

    wd = input('enter a word:')
    param = {'query': wd}
    response = requests.get(url=url, params=param, headers=headers)
    # Override the detected encoding before reading .text.
    response.encoding = 'utf-8'
    page_text = response.text

    # Save the result page as "<keyword>.html".
    fileName = wd + '.html'
    with open(fileName, 'w', encoding='utf-8') as fp:
        fp.write(page_text)
    print(fileName, '爬取成功!!!')

    破解百度翻译

    import requests

    # NOTE(review): the heading announces Baidu Translate, but this code queries
    # the Sogou search endpoint exactly like the previous snippet - confirm
    # whether the intended example was lost.
    url = 'https://www.sogou.com/web'
    # UA spoofing: present the request as coming from a desktop Chrome browser.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36',
    }

    wd = input('enter a word:')
    # The keyword is passed as the `query` URL parameter.
    param = {'query': wd}
    response = requests.get(url=url, params=param, headers=headers)
    # Override the detected encoding before reading .text.
    response.encoding = 'utf-8'
    page_text = response.text

    # Save the result page as "<keyword>.html".
    fileName = wd + '.html'
    with open(fileName, 'w', encoding='utf-8') as fp:
        fp.write(page_text)
    print(fileName, '爬取成功!!!')

    爬取任意城市对应肯德基的位置

    # The store list is loaded dynamically: the page POSTs a form to this
    # endpoint and receives the data back as text.
    url = 'http://www.kfc.com.cn/kfccda/ashx/GetStoreList.ashx?op=keyword'
    # UA spoofing.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36',
    }

    city = input('enter a cityName:')
    # Form fields of the request; only the keyword varies with user input.
    data = {
        "cname": "",
        "pid": "",
        "keyword": city,
        "pageIndex": "2",
        "pageSize": "10",
    }

    response = requests.post(url=url, headers=headers, data=data)
    json_text = response.text
    print(json_text)

    分页肯德基

    import json
    
    import requests
    
    # Scrape KFC restaurant locations for any city, pages 1-8.
    # The data is loaded dynamically, so the form POST is replayed per page.
    city = input('enter a cityName:')

    # The endpoint and the UA-spoofing headers are the same for every page,
    # so they are built once instead of on each iteration.
    url = 'http://www.kfc.com.cn/kfccda/ashx/GetStoreList.ashx?op=keyword'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36'
    }

    # Open the output once. Each page is written on its own line so the file
    # can be read back line by line; without a separator the responses would
    # run together into one unparseable stream.
    with open('data.json', "a", encoding="UTF-8") as f:
        for i in range(1, 9):
            data = {
                "cname": "",
                "pid": "",
                "keyword": city,
                "pageIndex": i,
                "pageSize": "10",
            }
            response = requests.post(url=url, headers=headers, data=data)
            json_text = response.text
            f.write(json_text + '\n')

     抓取国家药品监督管理局 公司

    # Note: when the data is loaded dynamically, use the browser's global
    # search to find the request that actually carries it.
    import requests

    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36'
    }

    # Step 1: walk the paginated list endpoint and collect every company ID.
    first_url = 'http://125.35.6.84:81/xk/itownet/portalAction.do?method=getXkzsList'
    ids = []
    for page in range(1, 11):
        data = {
            "on": "true",
            "page": str(page),
            "pageSize": "15",
            "productName": "",
            "conditionType": "1",
            "applyname": "",
            "applysn": "",
        }
        response = requests.post(url=first_url, data=data, headers=headers)
        # response.headers is the dict of response headers; only parse the
        # body when the server really answered with JSON.
        if response.headers['Content-Type'] == 'application/json;charset=UTF-8':
            json_obj = response.json()
            for dic in json_obj['list']:
                ids.append(dic['ID'])

    # Step 2: fetch the detail record of every collected ID.
    # NOTE(review): the original was collapsed onto one line; running this loop
    # after the page loop (not nested in it) is the reconstructed reading.
    detail_url = 'http://125.35.6.84:81/xk/itownet/portalAction.do?method=getXkzsById'
    for _id in ids:
        data = {
            'id': _id
        }
        company_text = requests.post(detail_url, data=data, headers=headers).text
        print(company_text)

    爬取图片的2种方法

    import requests
    from urllib import request

    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36",
    }
    img_url = 'https://www.baidu.com/img/bd_logo1.png'

    # Method 1: fetch the raw bytes with requests (.content is bytes) and
    # write them out in binary mode.
    img_data = requests.get(
        url=img_url,
        headers=headers,
    ).content
    with open('./baidu_log.jpg', 'wb') as f:
        f.write(img_data)

    # Method 2: urllib downloads the URL straight into the target file.
    request.urlretrieve(img_url, './baidu_log2.jpg')

    正则数据解析

    解析原理: 标签定位,提取标签中存储的文本数据,或标签属性中的数据

    爬取糗事百科正则首页所有图

    pip install requests

    '''
    <div class="thumb">
    
    <a href="/article/121859578" target="_blank">
    <img src="//pic.qiushibaike.com/system/pictures/12185/121859578/medium/YZQA73IAY8J68GXC.jpg" alt="麻烦p的搞笑一点">
    </a>
    
    </div>
    '''
    import os
    import re
    import requests
    from  urllib import  request
    # Create the output directory; exist_ok avoids the check-then-create race.
    os.makedirs('./qiutu', exist_ok=True)
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36'
    }
    url = 'https://www.qiushibaike.com/pic/'
    page_text = requests.get(url=url, headers=headers).text

    # Capture the src of the <img> inside every <div class="thumb">.
    # re.S makes '.' match newlines as well, which the pattern needs because
    # the markup spans several lines (it does not strip whitespace).
    ex = r'<div class="thumb">.*?<img src="(.*?)" alt.*?</div>'
    img_url = re.findall(ex, page_text, re.S)
    for src in img_url:
        # The page uses protocol-relative addresses ("//pic..."), so the scheme
        # is added. A separate name keeps the page `url` above intact.
        full_url = 'https:' + src
        img_name = full_url.split('/')[-1]
        img_path = './qiutu/' + img_name
        request.urlretrieve(full_url, img_path)
        print(img_name, '下载成功')

    bs4解析

    解析原理:实例化一个Beautifulsoup(谁又福)的对象,将页面源码数据加载到该对象中

    使用该对象的相关属性和方法实现标签和数据提取

    • pip install bs4
    • pip install lxml
    • pip install  html5lib

    2种方式

    BeautifulSoup(page_text,'lxml')#将从互联网请求到的页面源码加载到对象中
    BeautifulSoup(fp,'lxml')#将本地文件中的源码加载到对象中
    好处就是 自带标签
    from bs4 import BeautifulSoup

    # Parse a local HTML file; the context manager closes the file handle as
    # soon as the document has been loaded into the soup.
    with open('./test.html', 'r', encoding='utf-8') as fp:
        soup = BeautifulSoup(fp, "lxml")

    # Tag access
    # print(soup.title)
    # print(soup.div)        # the first <div> by default
    # print(soup.find('a'))  # the first <a> by default

    # Locating by attribute
    # print(soup.find('div', class_='song'))
    # print(soup.find_all('div')[2])  # all <div> tags, then index 2 (0-based)

    # select(): CSS selectors, always returns a list
    # print(soup.select('.song'))
    # print(soup.select('div'))

    # Hierarchy: '>' is one level, a space is any number of levels
    # print(soup.select('.tang > ul > li >a'))  # every <a>
    # print(soup.select('.tang a'))             # every <a>

    # Text: .string is the tag's own direct text, .text / get_text() is all of it
    # print(soup.p.string)
    # print(soup.find('div', class_='tang').get_text())
    # print(soup.find('div', class_='tang').text)

    # Attribute values
    # print(soup.a['href'])
    # print(soup.select('.tang>ul>li>a')[0]['href'])

    xpath解析

    解析原理:实例化一个etree对象,将页面源码加载到该对象中,使用etree中的xpath方法结合xpath表达式进行标签定位和数据提取
    2种方式
    etree.parse('本地文件路径')
    etree.HTML(page_text)#远程文件
    from lxml import etree

    # Load the local page into an element tree.
    tree = etree.parse('./test.html')

    # Ways to reach the <title> tag:
    # print(tree.xpath('/html/head/title/text()'))
    # print(tree.xpath('/html//title'))
    # print(tree.xpath('//title/text()'))

    # Locate by class: text of the first <p> inside div.song.
    print(tree.xpath('//div[@class="song"]/p[1]/text()')[0])

    # The remaining lookups each print the full result list, in this order:
    #   1. link text of the 4th <li> inside div.tang
    #   2. "locate by id" example - NOTE(review): it filters on @class="id",
    #      not on an @id attribute; confirm the intended expression
    #   3. every title attribute found on <a> tags
    #   4. every href attribute found on <a> tags
    for expression in (
        '//div[@class="tang"]/ul/li[4]/a/text()',
        '//div[@class="id"]/ul/li[4]/a/text()',
        '//a/@title',
        '//a/@href',
    ):
        print(tree.xpath(expression))

    中文乱码问题

    # Download wallpapers page by page and repair the garbled Chinese names.
    # os, urllib.request and the headers dict are brought in here so that the
    # snippet runs on its own.
    import os
    from urllib import request

    import requests
    from lxml import etree

    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36'
    }

    start_page = int(input('start page num:'))
    end_page = int(input('end page num:'))

    os.makedirs('./meinvs', exist_ok=True)

    # Generic URL template; keep it untouched and fill in the page per request.
    url = 'http://pic.netbian.com/4kmeinv/index_%d.html'
    for page in range(start_page, end_page + 1):
        # The first page has no index suffix.
        if page == 1:
            new_url = 'http://pic.netbian.com/4kmeinv/'
        else:
            new_url = url % page
        response = requests.get(url=new_url, headers=headers)
        # Forcing response.encoding = 'utf-8' was left disabled in the
        # original; the names are re-decoded one by one below instead.
        page_text = response.text
        # Parse the picture name and the src attribute of every list item.
        tree = etree.HTML(page_text)
        li_list = tree.xpath('//div[@class="slist"]/ul/li')
        for li in li_list:
            img_name = li.xpath('./a/img/@alt')[0]
            # Undo the mojibake: turn the str back into bytes as iso-8859-1,
            # then decode those bytes as gbk.
            img_name = img_name.encode('iso-8859-1').decode('gbk') + '.jpg'
            img_src = 'http://pic.netbian.com' + li.xpath('./a/img/@src')[0]
            img_path = './meinvs/' + img_name
            request.urlretrieve(img_src, img_path)
            print(img_name, '下载成功!!!')
            

    xpath 或的使用

    # Collect city names from the air-quality history site.
    url = 'https://www.aqistudy.cn/historydata/'
    page_text = requests.get(url=url, headers=headers).text
    tree = etree.HTML(page_text)

    # The two lists can be queried separately...
    # hot_city = tree.xpath('//div[@class="bottom"]/ul/li/a/text()')
    # all_city = tree.xpath('//div[@class="bottom"]/ul/div[2]/li/a/text()')
    # all_city

    # ...or in one pass: the XPath union operator '|' merges both node sets.
    # The bare expression only displays its value in an interactive session.
    tree.xpath(' | '.join([
        '//div[@class="bottom"]/ul/div[2]/li/a/text()',
        '//div[@class="bottom"]/ul/li/a/text()',
    ]))

    BOSS直聘(zhipin.com)职位爬取

    import requests
    from lxml import etree
    from urllib.parse import urljoin

    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'
    }
    url = 'https://www.zhipin.com/job_detail/?query=python%E7%88%AC%E8%99%AB&city=101010100&industry=&position='
    page_text = requests.get(url=url, headers=headers).text

    # Parse the result list; a dedicated name avoids shadowing the builtin `list`.
    tree = etree.HTML(page_text)
    job_items = tree.xpath('//div[@class="job-list"]//li')
    for job in job_items:
        position = job.xpath('.//div[@class="job-title"]/text()')[0]
        salary = job.xpath('.//span[@class="red"]/text()')[0]
        gongsi = job.xpath('.//div[@class="company-text"]/h3/a/text()')[0]

        # The list entry links to the job's detail page.
        url_tail = job.xpath('.//div[@class="info-primary"]//a/@href')[0]
        print(url_tail)
        # urljoin resolves the href against the site root, so a leading '/'
        # in the href does not produce a '//' in the final address.
        url_tail = urljoin('https://www.zhipin.com/', url_tail)
        page_text_tail = requests.get(url=url_tail, headers=headers).text
        tree2 = etree.HTML(page_text_tail)

        # Pull the job description text out of every detail section.
        maiosu_list = tree2.xpath('//div[@class="detail-content"]')
        for section in maiosu_list:
            description = section.xpath('.//div[@class="job-sec"]/div[@class="text"]/text()')
            print(position, salary, gongsi, description)

     requests模块高级操作

    - 匿名度:
        - 透明:对方服务器可以知道你使用了代理,并且也知道你的真实IP
        - 匿名:对方服务器可以知道你使用了代理,但不知道你的真实IP
        - 高匿:对方服务器不知道你使用了代理,更不知道你的真实IP。

    - 类型:
        - http:该类型的代理ip只可以发起http协议头对应的请求
        - https:该类型的代理ip只可以发起https协议头对应的请求

    requests的get和post方法常用的参数:
      url
      headers
      data/params post用data
    proxies 代理
    Connection:close #来一个连接关闭一个

    免费获取代理的方法

    import os
    import requests
    from lxml import etree
    # start_page=int(input('start page num:'))
    # end_page=int(input('end page num:'))
    
    # NOTE(review): this directory is created but the results below are written
    # to ./ip.txt, not into it - confirm which location was intended.
    os.makedirs('./daili', exist_ok=True)

    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36'
    }

    # Open the output once for the whole run instead of once per table row.
    with open('./ip.txt', 'a') as f:
        for i in range(1, 3):
            url = 'https://www.xicidaili.com/nn/{}'.format(i)
            # Despite its name, `response` holds the page text.
            response = requests.get(url=url, headers=headers).text
            # Build the element tree for this page.
            tree = etree.HTML(response)
            tr_list = tree.xpath('//*[@id="ip_list"]//tr[@class="odd"]')
            for tr in tr_list:
                # Column 2 is the IP address, column 3 the port.
                one_ip = tr.xpath('.//td[2]/text()')[0]
                port = tr.xpath('.//td[3]/text()')[0]
                list_wr = one_ip + ':' + port
                # One "ip:port" entry per line. The newline is written as an
                # escape sequence; a literal line break inside the quotes is a
                # syntax error.
                f.write(list_wr + '\n')

    使用代理的方法

    import random
    import requests
    # Proxy pools, grouped by the scheme each proxy can serve.
    https = [
        {'https': "122.193.244.58:9999"},
    ]
    http = [
        {'http': "101.132.131.158:8118"},
        {'http': "120.210.219.101:8080"},
    ]
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36',
    }
    url = 'https://www.baidu.com/s?wd=ip'

    # Choose a random proxy from the pool that matches the URL's scheme.
    is_https = url.split(':')[0] == 'https'
    page_text = requests.get(
        url=url,
        headers=headers,
        proxies=random.choice(https if is_https else http),
    ).text
    # The page is echoed to the console only for https URLs.
    if is_https:
        print(page_text)

    with open('./ip.html', 'w', encoding='utf-8') as fp:
        fp.write(page_text)

    cookie相关操作

    - cookie:可以使得服务器端记录客户端的相关状态
    -处理cookie的方式
    -手动处理 cookie是有有效时长的,并且会动态变化
    -自动处理 使用会话机制session
    -session用法:
    实例化一个会话对象:requests.Session()
    可以进行请求发送(post,get)
    请求过程如果产生了cookie就会被自动存储到session中

    爬取雪球

    # Goal: fetch news titles and summaries from xueqiu.com.
    # This attempt sends no cookie, so the API answers with an error payload.
    url = 'https://xueqiu.com/v4/statuses/public_timeline_by_category.json?since_id=-1&max_id=-1&count=10&category=-1'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36'
    }
    json_obj = requests.get(url=url, headers=headers).json()
    print(json_obj)
    # Output recorded in the original post (it is output, not code):
    # {'error_description': '遇到错误,请刷新页面或者重新登录帐号后再试', 'error_uri': '/v4/statuses/public_timeline_by_category.json', 'error_code': '400016'}

    加session的方法

    import requests

    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36',
    }
    # The API endpoint that must be requested with a cookie.
    url = 'https://xueqiu.com/v4/statuses/public_timeline_by_category.json?since_id=-1&max_id=-1&count=10&category=-1'

    session = requests.Session()
    # Visit the home page first: any cookie set by this response is stored in
    # the session automatically.
    session.get(url='https://xueqiu.com/', headers=headers)

    # Same call as before, but made through the session (not through
    # requests directly) so that the stored cookie is sent along.
    json_obj = session.get(url=url, headers=headers).json()
    print(json_obj)

    云打码使用流程

    • http://www.yundama.com/demo.html
    • 注册:
      • 普通用户
      • 开发者用户
    • 登录:
      • 登录普通用户:
        • 查询剩余题分
      • 登录开发者用户:
        • 创建一个软件:我的软件-》创建一个新软件(软件名称,秘钥不可以修改),使用软件的id和秘钥
        • 下载示例代码:开发文档-》点此下载:云打码接口DLL-》PythonHTTP示例下载

    爬取古诗文(自动打码)

    import http.client, mimetypes, urllib, json, time, requests
    ######################################################################
    class YDMHttp:
        """Small HTTP client for the Yundama captcha-recognition API.

        Every call posts the account credentials plus a ``method`` name to
        ``apiurl`` and decodes the JSON reply. Where noted, a method returns
        the negative ``ret`` code from the reply on failure, or -9001 when the
        reply is empty.
        """

        apiurl = 'http://api.yundama.com/api.php'
        username = ''
        password = ''
        appid = ''
        appkey = ''

        def __init__(self, username, password, appid, appkey):
            """Store the account and software credentials; appid is kept as a str."""
            self.username = username
            self.password = password
            self.appid = str(appid)
            self.appkey = appkey

        def _payload(self, method, **extra):
            """Build the form fields shared by every API call, plus *extra*."""
            data = {'method': method, 'username': self.username, 'password': self.password,
                    'appid': self.appid, 'appkey': self.appkey}
            data.update(extra)
            return data

        @staticmethod
        def _pick(response, key):
            """Return response[key], the negative ``ret`` code, or -9001 for an empty reply."""
            if not response:
                return -9001
            if response['ret'] and response['ret'] < 0:
                return response['ret']
            return response[key]

        def request(self, fields, files=None):
            """POST *fields* (and optional *files*) to the API and return the decoded JSON."""
            response = self.post_url(self.apiurl, fields, files)
            return json.loads(response)

        def balance(self):
            """Return the account balance, or a negative error code."""
            return self._pick(self.request(self._payload('balance')), 'balance')

        def login(self):
            """Log in and return the user id, or a negative error code."""
            return self._pick(self.request(self._payload('login')), 'uid')

        def upload(self, filename, codetype, timeout):
            """Upload the captcha image at *filename*; return its cid or a negative error code."""
            data = self._payload('upload', codetype=str(codetype), timeout=str(timeout))
            return self._pick(self.request(data, {'file': filename}), 'cid')

        def result(self, cid):
            """Return the recognized text for *cid*, or '' when none is available."""
            response = self.request(self._payload('result', cid=str(cid)))
            if not response:
                return ''
            return response.get('text') or ''

        def decode(self, filename, codetype, timeout):
            """Upload a captcha and poll for its text once per second.

            Returns ``(cid, text)`` on success, ``(-3003, '')`` when no text
            arrived within *timeout* polls, or ``(error_code, '')`` when the
            upload itself failed.
            """
            cid = self.upload(filename, codetype, timeout)
            if cid <= 0:
                return cid, ''
            for _ in range(timeout):
                result = self.result(cid)
                if result != '':
                    return cid, result
                time.sleep(1)
            return -3003, ''

        def report(self, cid):
            """Report *cid* with flag '0'; return the ``ret`` code or -9001 for an empty reply."""
            response = self.request(self._payload('report', cid=str(cid), flag='0'))
            if response:
                return response['ret']
            return -9001

        def post_url(self, url, fields, files=None):
            """POST *fields* to *url*, attaching the files named in *files*.

            *files* maps a form field name to a file path. The files are
            opened here and closed again once the request has been sent; the
            caller's mapping is left unmodified.
            """
            from contextlib import ExitStack

            with ExitStack() as stack:
                opened = {key: stack.enter_context(open(path, 'rb'))
                          for key, path in (files or {}).items()}
                res = requests.post(url, files=opened, data=fields)
            return res.text
    ######################开始###########################################
    # 将示例代码中的可执行程序封装成函数
    def transformCodeImg(imgPath, imgType, username='bobo328410948', password='bobo328410948',
                         appid=6003, appkey='1f4b564483ae5c907a1d34f8e2f2776c', timeout=30):
        """Recognize the captcha image at *imgPath* through Yundama.

        This is the executable part of the vendor sample code wrapped in a
        function.

        Args:
            imgPath: path of the captcha image file.
            imgType: captcha type id, e.g. 1004 for 4 letters/digits. Types are
                priced differently; see http://www.yundama.com/price.html
            username: ordinary-user account name.
            password: ordinary-user account password.
            appid: software id from the developer console ("my software").
            appkey: software key from the developer console.
            timeout: recognition timeout in seconds.

        Returns:
            The recognized text, or None when the credentials are still the
            'username' placeholder.

        SECURITY NOTE(review): the default account and key are real-looking
        credentials committed with the source. They remain only so existing
        two-argument calls keep working; pass your own values and rotate these.
        """
        # Sanity check carried over from the sample: refuse the placeholder.
        if username == 'username':
            print('请设置好相关参数再测试')
            return None

        yundama = YDMHttp(username, password, appid, appkey)

        # Log in to Yundama.
        uid = yundama.login()
        print('uid: %s' % uid)

        # Query the remaining balance.
        balance = yundama.balance()
        print('balance: %s' % balance)

        # Run the recognition: image path, captcha type id, timeout (seconds).
        cid, result = yundama.decode(imgPath, imgType, timeout)
        return result
    
    
    import requests
    from lxml import etree
    headers = {
        'User-Agent':'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36'
    }
    # Log in to gushiwen.org with an automatically recognized captcha.
    # All requests go through one session so cookies are carried along; the
    # order of the requests below therefore matters.
    s=requests.Session()
    url='https://so.gushiwen.org/user/login.aspx?from=http://so.gushiwen.org/user/collect.aspx'
    page_text=s.get(url=url,headers=headers).text
    tree=etree.HTML(page_text)
    # Address of the captcha image
    img_src='https://so.gushiwen.org/'+tree.xpath('//*[@id="imgCode"]/@src')[0]
    # Download the captcha bytes and write them to disk
    img_data=s.get(url=img_src,headers=headers).content## requesting the captcha also produces a cookie
    with open('./gushiwen.jpg','wb') as f:
        f.write(img_data)
    
                                # captcha image   type id
    result=transformCodeImg('./gushiwen.jpg',1004)
    print(result,'打印出打码后的验证码')
    
    # Hidden form values that must be sent back with the login request
    __VIEWSTATE=tree.xpath('//*[@id="__VIEWSTATE"]/@value')[0]
    __VIEWSTATEGENERATOR=tree.xpath('//*[@id="__VIEWSTATEGENERATOR"]/@value')[0]
    
    # Simulated login
    # NOTE(review): the account e-mail and password are hard-coded in the form
    # data below - confirm they are throwaway credentials.
    post_url='https://so.gushiwen.org/user/login.aspx?from='
    data={
    "__VIEWSTATE":__VIEWSTATE,
    "__VIEWSTATEGENERATOR":__VIEWSTATEGENERATOR,
    "from":"",
    "email": "1820405927@qq.com",
    "pwd": "1213.com",
    "code": result,
    "denglu": "登录"
    }
    response=s.post(url=post_url,headers=headers,data=data)
    print(response.status_code)# HTTP status of the login request
    page_text=response.text# page returned after login; saved to an html file below
    with open('./gushiwen.html','w',encoding='utf-8') as f:
        f.write(page_text)

    线程池使用测试方法

    from time import sleep
    import time
    from multiprocessing.dummy import Pool
    urls = [
        'www.baidu.com',
        'www.songou.com',
        'www.xinlang.com',
    ]


    def request(url):
        """Simulate a blocking download of *url* that takes two seconds."""
        print('正在请求:', url)
        sleep(2)
        print('下载成功', url)


    # Time how long the three simulated downloads take on three worker threads.
    start = time.time()

    pool = Pool(3)
    pool.map(request, urls)
    elapsed = time.time() - start
    print(elapsed)

    线程池爬取梨视频中的短视频

      *线程池应该作用在爬虫中最耗时的操作上

      耗时操作:视频下载,视频的保存

    # 使用线程池爬取视频中的短视频
    import random
    import re
    from multiprocessing.dummy import Pool

    import requests
    from lxml import etree

    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36'
    }
    url = 'https://www.pearvideo.com/category_1'
    page_text = requests.get(url=url, headers=headers).text
    tree = etree.HTML(page_text)

    # Every <li> of the list links to a detail page. The real video address
    # sits in an inline script on that page, e.g.
    #   srcUrl="https://video.pearvideo.com/mp4/adshort/20190528/cont-1559965-13958439_adpkg-ad_hd.mp4",vdoUrl=srcUrl,...
    # so it is extracted with a regular expression.
    ex = 'srcUrl="(.*?)",vdoUrl'
    li_list = tree.xpath('//*[@id="listvideoListUl"]/li')


    def getiVideoData(url):
        """Download *url* and return the raw bytes of the response body."""
        return requests.get(url=url, headers=headers).content


    def saveVido(data):
        """Write *data* to '<random 0-9999>.mp4' in the current directory.

        NOTE(review): the random number can repeat, in which case a later
        video overwrites an earlier file.
        """
        name = str(random.randint(0, 9999)) + '.mp4'
        with open(name, 'wb') as f:
            f.write(data)
        print(name, '下载成功')


    viseo_urls = []  # addresses of all videos
    for li in li_list:
        detail_url = 'https://www.pearvideo.com/' + li.xpath('./div/a/@href')[0]
        detail_page_text = requests.get(url=detail_url, headers=headers).text
        matches = re.findall(ex, detail_page_text, re.S)
        # Skip detail pages without a srcUrl instead of failing with IndexError.
        if not matches:
            continue
        viseo_urls.append(matches[0])
    print(viseo_urls)

    # The pool is used only for the slow parts: downloading the videos,
    # then saving them.
    pool = Pool(4)
    all_video_data_list = pool.map(getiVideoData, viseo_urls)
    pool.map(saveVido, all_video_data_list)
    # Release the worker threads once all work is done.
    pool.close()
    pool.join()
  • 相关阅读:
    Java多线程之赛跑游戏(含生成exe文件)
    JavaSE之绘制菱形
    JavaSE项目之员工收录系统
    深度解析continue,break和return
    如何查看yum安装路径
    转载 linux umount 时出现device is busy 的处理方法--fuser
    linux安装扩展总结
    linux 编译安装amqp
    vmware 实现linux目录映射window本地目录
    yaf学习之——生成yaf示例框架
  • 原文地址:https://www.cnblogs.com/zaizai1573/p/10934297.html
Copyright © 2011-2022 走看看