zoukankan      html  css  js  c++  java
  • 爬虫系列---多线程爬取实例

    1.爬取站长图片源码

    #爬取站长'http://sc.chinaz.com/tupian/gudianmeinvtupian.html',所有的古典美女图片
    import os
    import time
    import random
    import requests
    from lxml import etree
    from multiprocessing.dummy import Pool
    # Listing pages to crawl: the un-numbered first page followed by the
    # numbered pages 2 through 6.
    url = 'http://sc.chinaz.com/tupian/gudianmeinvtupian.html'
    page_url_list = [url] + [
        f'http://sc.chinaz.com/tupian/gudianmeinvtupian_{page_no}.html'
        for page_no in range(2, 7)
    ]

    # Headers sent with every request made by this script.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.20 Safari/537.36',
        # 'Content-Encoding':'gzip',
        # 'Content-Type': 'text/html',
    }

    # Image URLs collected by get_pig_url(); shared by all worker threads.
    pig_url_list = list()
    def get_pig_url(url):
        '''Collect the image URLs found on one listing page.

        Fetches ``url`` with the module-level ``headers``, parses the HTML and,
        for every child ``<div>`` of ``<div id="container">``, appends the first
        ``src2`` attribute of an ``<img>`` inside it to the module-level
        ``pig_url_list``.  Child divs without such an image are skipped instead
        of aborting the whole page with an IndexError.

        :param url: address of the listing page to scan.
        :return: None; results accumulate in ``pig_url_list``.
        '''
        response = requests.get(url=url, headers=headers)
        # Parse the page with XPath.
        tree = etree.HTML(response.content.decode())
        for div in tree.xpath('//div[@id="container"]/div'):
            sources = div.xpath('.//img/@src2')
            if sources:  # not every child div is guaranteed to hold an image
                pig_url_list.append(sources[0])
    
    def download(url):
        '''Fetch ``url`` with the shared ``headers`` and return the response body as bytes.'''
        response = requests.get(headers=headers, url=url)
        image_bytes = response.content
        return image_bytes
    
    def save_pig(data):
        '''Write one image to ``zhanzhangpig/`` under a random numeric ``.jpg`` name.

        The file is opened in exclusive-creation mode, so when the randomly
        drawn name is already taken (by another worker thread or by an earlier
        run) a new name is drawn instead of silently overwriting that image.

        :param data: raw image bytes to store.
        :return: None.
        '''
        while True:
            name = str(random.randrange(0, 1000000)) + '.jpg'
            path = 'zhanzhangpig/' + name
            try:
                # 'xb' raises FileExistsError rather than truncating an existing file.
                with open(path, 'xb') as f:
                    f.write(data)
            except FileExistsError:
                continue  # name collision: draw another random name
            return
    
    # Make sure the output directory exists; exist_ok avoids the
    # check-then-create race of a separate os.path.exists() test.
    os.makedirs('zhanzhangpig', exist_ok=True)

    # Run the three stages on a thread pool.
    print('多线程爬取开始')
    start_time = time.time()
    pool = Pool(8)
    try:
        # Stage 1: scan the listing pages; image URLs accumulate in pig_url_list.
        pool.map(get_pig_url, page_url_list)
        # Stage 2: download every collected image into memory.
        data_list = pool.map(download, pig_url_list)
        # Stage 3: write each downloaded image to disk.
        pool.map(save_pig, data_list)
        # pool.map blocks until all of its tasks are done, so the crawl is
        # complete at this point and the elapsed time can be reported.
        end_time = time.time()
        print('多线程爬取结束')
        print('耗时:', end_time - start_time)
    finally:
        # Shut the thread pool down even when one of the stages raised.
        pool.close()
        pool.join()

    2.爬取妹子网图片(https://www.mzitu.com/tag/ugirls/)

    import os
    import time
    import random
    import requests
    from lxml import etree
    from multiprocessing.dummy import Pool
    # One shared session for every request in this script.
    session = requests.session()

    # Make sure the output directory exists; exist_ok avoids the
    # check-then-create race of a separate os.path.exists() test.
    os.makedirs('meizitu', exist_ok=True)

    # Listing pages to crawl: the un-numbered first page followed by the
    # numbered pages 2 through 16.
    url = 'https://www.mzitu.com/tag/ugirls/'
    page_url_list = [url] + [
        f'https://www.mzitu.com/tag/ugirls/page/{page_no}/'
        for page_no in range(2, 17)
    ]

    # Headers sent with every request made by this script.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.119 Safari/537.36',
        'Upgrade-Insecure-Requests': '1',
        # Anti-scraping measure: the request must carry the originating page address.
        'Referer': 'https://www.mzitu.com/tag/ugirls/',
    }

    # Image URLs collected by get_pig_url(); shared by all worker threads.
    pig_url_list = []
    def get_pig_url(url):
        '''Collect the image URLs found on one listing page.

        Fetches ``url`` through the shared ``session`` with the module-level
        ``headers``, parses the HTML and, for every ``<li>`` under
        ``<ul id="pins">``, appends the first ``data-original`` attribute of an
        ``<img>`` inside it to the module-level ``pig_url_list``.  Items without
        such an image are skipped instead of aborting the whole page with an
        IndexError.

        :param url: address of the listing page to scan.
        :return: None; results accumulate in ``pig_url_list``.
        '''
        response = session.get(url=url, headers=headers)
        # Parse the page with XPath.
        tree = etree.HTML(response.content.decode())
        for item in tree.xpath('//ul[@id="pins"]/li'):
            sources = item.xpath('.//img/@data-original')
            if sources:  # not every list item is guaranteed to hold an image
                pig_url_list.append(sources[0])
    
    def download(url):
        '''Fetch ``url`` through the shared session and return the response body as bytes.'''
        response = session.get(headers=headers, url=url)
        image_bytes = response.content
        return image_bytes
    
    def save_pig(data):
        '''Write one image to ``meizitu/`` under a random numeric ``.jpg`` name.

        The file is opened in exclusive-creation mode, so when the randomly
        drawn name is already taken (by another worker thread or by an earlier
        run) a new name is drawn instead of silently overwriting that image.

        :param data: raw image bytes to store.
        :return: None.
        '''
        while True:
            name = str(random.randrange(0, 1000000)) + '.jpg'
            path = 'meizitu/' + name
            try:
                # 'xb' raises FileExistsError rather than truncating an existing file.
                with open(path, 'xb') as f:
                    f.write(data)
            except FileExistsError:
                continue  # name collision: draw another random name
            return
    
    print('多线程爬取开始')
    start_time = time.time()
    # Start the worker threads.
    pool = Pool(10)
    try:
        # Stage 1: scan all listing pages; image URLs accumulate in pig_url_list.
        # (To crawl a single page only, call get_pig_url(url) directly instead.)
        pool.map(get_pig_url, page_url_list)
        # Stage 2: download every collected image into memory.
        data_list = pool.map(download, pig_url_list)
        # Stage 3: write each downloaded image to disk.
        pool.map(save_pig, data_list)
    finally:
        # Shut the thread pool down even when one of the stages raised.
        pool.close()
        pool.join()
    end_time = time.time()
    print('多线程爬取结束')
    print('耗时:', end_time - start_time)
    # -------------------- count the files in the output folder -----------------
    print(len(os.listdir('./meizitu')))

    !!!384张美图等你拿

  • 相关阅读:
    js画线
    开源Math.NET基础数学类库使用(11)C#计算相关系数
    Cent OS5.2安装Hyper-V集成光盘
    解决oracle11g的ORA-12505问题
    Oracle11g安装出现em.ear
    Entity Framework Code First (八)迁移 Migrations
    Modernizr.js入门指南(HTML5&CSS3浏览器兼容插件)
    Waves:类Material Design 的圆形波浪(涟漪)点击特效插件
    多种css3时尚侧栏菜单展开显示效果Off-Canvas Menu Effects
    iOS 复选框风格转换 Switchery 开关效果
  • 原文地址:https://www.cnblogs.com/angle6-liu/p/10439624.html
Copyright © 2011-2022 走看看