zoukankan      html  css  js  c++  java
  • 线程池在爬虫案例中的应用

    import requests 
    from lxml import etree
    import re 
    from multiprocessing.dummy import Pool
    # Goal: scrape video data from Pear Video (pearvideo.com).
    headers = {
        'User-Agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2626.106 Safari/537.36'
    }
    # Rule of thumb: a thread pool is meant for blocking, time-consuming operations.

    # Request the category page and parse out each video's detail-page URL and title.
    url = 'https://www.pearvideo.com/category_5'
    # A timeout keeps a stalled connection from hanging the script forever.
    list_response = requests.get(url=url,headers=headers,timeout=10)
    # Fail fast on an HTTP error instead of parsing an error page into an empty list.
    list_response.raise_for_status()
    page_text = list_response.text

    tree = etree.HTML(page_text)
    li_list = tree.xpath('//ul[@id="listvideoListUl"]/li')
    # Compiled once outside the loop; captures the value of srcUrl in the detail page.
    ex = re.compile(r'srcUrl="(.*?)",vdoUrl')
    urls = []   # One {'name': ..., 'url': ...} dict per video that could be resolved.
    for li in li_list:
        hrefs = li.xpath('./div/a/@href')
        titles = li.xpath('./div/a/div[2]/text()')
        if not hrefs or not titles:
            # This list item lacks the expected link/title nodes; skip it
            # rather than aborting the whole crawl with an IndexError.
            continue
        detail_url = 'https://www.pearvideo.com/'+hrefs[0]
        name = titles[0]+'.mp4'
        # Request the detail page; a failure here only costs this one video.
        try:
            detail_response = requests.get(url=detail_url,headers=headers,timeout=10)
            detail_response.raise_for_status()
        except requests.RequestException as err:
            print(detail_url,'skipped: detail page request failed:',err)
            continue
        detail_page_text = detail_response.text
        # Extract the direct video URL from the detail page source.
        match = ex.search(detail_page_text)
        if match is None:
            print(detail_url,'skipped: no srcUrl found in detail page')
            continue
        video_url = match.group(1)
        dic = {
            'name':name,
            'url':video_url
        }
        urls.append(dic)
    def get_video_data(dic):
        """Download one video and save it to a file in the working directory.

        Args:
            dic: Mapping with 'url' (the video URL to request) and 'name'
                (the file name the data is written to).

        Returns:
            None. Progress is reported with print(). A network or HTTP error
            is reported and swallowed so that one bad video does not abort
            the other downloads; file-system errors (OSError) still propagate.
        """
        url = dic['url']
        print(dic['name'],'正在下载!')
        try:
            # stream=True avoids holding the whole video in memory; the timeout
            # prevents a stalled connection from blocking a worker indefinitely.
            with requests.get(url=url,headers=headers,stream=True,timeout=30) as response:
                # Check the status before opening the file so an HTTP error
                # body is never saved as if it were a video.
                response.raise_for_status()
                # Persist the data chunk by chunk.
                with open(dic['name'],'wb') as fp:
                    for chunk in response.iter_content(chunk_size=1024 * 1024):
                        fp.write(chunk)
        except requests.RequestException as err:
            # NOTE: if the failure happened mid-stream, a partial file may remain.
            print(dic['name'],'下载失败!',err)
            return
        print(dic['name'],'下载成功!')
    # Use a thread pool to request the video data (a slow, blocking operation).
    pool = Pool(4)
    try:
        # map() blocks until every entry in urls has been processed.
        pool.map(get_video_data,urls)
    finally:
        # Always shut the workers down, even if a download raised.
        pool.close()
        pool.join()
    
    
  • 相关阅读:
    rqnoj71 拔河比赛
    NOI2002 洛谷 P1196 银河英雄传说
    sdibt 1244 烦人的幻灯片
    POJ 1273 Drainage Ditches -dinic
    NOIP2005提高组 过河
    OpenJudge 7627 鸡蛋的硬度
    Openjudge 8782 乘积最大
    OpenJudge 7624 山区建小学
    UVa 1328 Period
    UVa 11384 Help is needed for Dexter
  • 原文地址:https://www.cnblogs.com/gerenboke/p/13389092.html
Copyright © 2011-2022 走看看