  • Scraping meizitu.com image galleries with Python's Scrapy

    A little crawler project I put together in my spare time.

    The main spider:

    import scrapy
    from ..items import MeiziItem


    class MztSpider(scrapy.Spider):
        name = 'mzt'
        allowed_domains = ['meizitu.com']
        start_urls = ['http://meizitu.com/']

        def parse(self, response):
            # Each tag link on the home page becomes one item
            tags = response.xpath(".//*[@class='tags']/span/a")
            for i in tags:
                item = MeiziItem()
                tag_href = i.xpath(".//@href").extract()[0]
                tag_name = i.xpath(".//@title").extract()[0]
                item['tag_name'] = tag_name
                item['tag_href'] = tag_href
                yield scrapy.Request(url=item['tag_href'], meta={'item': item}, callback=self.parse_page)

        def parse_page(self, response):
            item = response.meta['item']
            # After entering a tag page, grab the pagination buttons at the bottom
            page_lists = response.xpath(".//*[@id='wp_page_numbers']/ul/li")
            # Read the button text to work out how many pages this tag spans
            page_list = page_lists.xpath('.//text()')
            # If there are multiple pages, check whether the first button is
            # '首页' ("home"): some tag pages start with a '首页' button, others
            # with '1', so the number of non-numeric buttons to subtract differs
            if len(page_lists) > 0:
                if page_list[0].extract() == '首页':
                    page_num = len(page_lists) - 3
                else:
                    page_num = len(page_lists) - 2
            else:
                page_num = 1

            # Build page-numbered URLs from the tag page's own URL,
            # e.g. '.../tag_2.html' -> prefix '.../tag_', '.../tag.html' -> '.../tag_'
            if '_' in item['tag_href']:
                # Slice off everything after the last '_' (keeping the '_')
                index = item['tag_href'][::-1].index('_')
                href_pre = item['tag_href'][:-index]
            else:
                if page_num == 1:
                    href_pre = item['tag_href'].split('.html')[0]
                else:
                    href_pre = item['tag_href'].split('.html')[0] + '_'
            for i in range(1, page_num + 1):
                # Copy the item so the requests yielded below don't all share
                # and overwrite one mutable object
                item = response.meta['item'].copy()
                if page_num == 1:
                    href = href_pre + '.html'
                else:
                    href = href_pre + str(i) + '.html'
                item['page_list'] = href
                yield scrapy.Request(url=item['page_list'], meta={'item': item}, callback=self.parse_album)

        def parse_album(self, response):
            albums = response.xpath(".//*[@class='pic']")
            for album in albums:
                item = response.meta['item'].copy()
                album_href = album.xpath(".//a/@href").extract()[0]
                album_name = album.xpath(".//a/img/@alt").extract()[0]
                item['album_name'] = album_name
                item['album_href'] = album_href
                yield scrapy.Request(url=item['album_href'], meta={'item': item}, callback=self.parse_img)

        def parse_img(self, response):
            img_list = response.xpath(".//*/p/img")
            for i, img in enumerate(img_list, start=1):
                item = response.meta['item'].copy()
                img_title = img.xpath(".//@alt").extract_first('')
                if not img_title:
                    # Fall back to the album name plus the image's position
                    # when the alt text is empty
                    img_title = item['album_name'] + '_' + str(i)
                img_urls = img.xpath(".//@src").extract()
                img_src = img_urls[0]
                item['img_title'] = img_title
                item['img_src'] = img_src
                item['img_urls'] = img_urls
                yield item
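
    With the items, pipelines, and settings shown below in place, the spider is launched from the project root with Scrapy's standard crawl command, using the spider's name attribute:

    scrapy crawl mzt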

    The item definitions:

    import scrapy


    class MeiziItem(scrapy.Item):
        # Tag name
        tag_name = scrapy.Field()
        # Tag URL
        tag_href = scrapy.Field()
        # Page-numbered URL under a tag
        page_list = scrapy.Field()
        # Album name
        album_name = scrapy.Field()
        # Album URL
        album_href = scrapy.Field()
        # Image title
        img_title = scrapy.Field()
        # Image URL
        img_src = scrapy.Field()
        # List of image URLs, consumed by the ImagesPipeline to download images
        img_urls = scrapy.Field()

    The console-output pipeline:

    print('Crawling...')
    print('Hang in there, this takes a while...')


    class MeiziPipeline(object):
        def process_item(self, item, spider):
            print('Tag name:', item['tag_name'])
            print('Tag URL:', item['tag_href'])
            print('Page URL:', item['page_list'])
            print('Album name:', item['album_name'])
            print('Album URL:', item['album_href'])
            print('Image title:', item['img_title'])
            print('Image URL:', item['img_src'])
            print('Image URL list:', item['img_urls'])
            print('----------------')
            return item

    The pipeline that saves images to disk:

    import scrapy
    from scrapy.pipelines.images import ImagesPipeline
    from scrapy.exceptions import DropItem
    
    
    class MztImagesPipeline(ImagesPipeline):
        def get_media_requests(self, item, info):
            # Schedule one download request per image URL on the item
            for image_url in item['img_urls']:
                yield scrapy.Request(image_url)

        def item_completed(self, results, item, info):
            # results is a list of (success, info) tuples, one per request
            image_paths = [x['path'] for ok, x in results if ok]
            if not image_paths:
                raise DropItem('Item contains no images')
            return item
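
    This subclass only overrides get_media_requests because the item stores its URLs in img_urls rather than the image_urls field the stock ImagesPipeline reads by default. An alternative (equivalent, as far as I can tell) is to keep the stock pipeline and remap the field in settings.py:

    # Point the stock ImagesPipeline at the item's img_urls field
    IMAGES_URLS_FIELD = 'img_urls'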

    Settings: add the following to settings.py

    BOT_NAME = 'meizi'
    
    SPIDER_MODULES = ['meizi.spiders']
    NEWSPIDER_MODULE = 'meizi.spiders'
    
    IMAGES_STORE = r'G:\mzt'        # where downloaded images are stored
    IMAGES_EXPIRES = 90             # skip re-downloading files younger than this many days
    IMAGES_MIN_HEIGHT = 100         # minimum image height in pixels
    IMAGES_MIN_WIDTH = 100          # minimum image width in pixels
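
    Note that neither pipeline runs unless it is registered. Assuming both classes live in meizi/pipelines.py (the default layout for a project named meizi), the registration in settings.py would look like this:

    ITEM_PIPELINES = {
        # Lower numbers run first (valid range 0-1000)
        'meizi.pipelines.MeiziPipeline': 300,
        'meizi.pipelines.MztImagesPipeline': 301,
    }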

    The final result of the crawl:

    I had wanted to save the images into separate folders by category, but didn't know how at the time, so everything ends up dumped into a single folder; a possible fix is sketched below.
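
    One way to get per-album folders is to override ImagesPipeline's file_path method, which decides the relative path each file is stored under. The sketch below is untested against this site and assumes Scrapy 2.4 or later, where file_path receives the item as a keyword argument; the class name SortedImagesPipeline is made up for illustration:

    import scrapy
    from scrapy.pipelines.images import ImagesPipeline


    class SortedImagesPipeline(ImagesPipeline):
        def get_media_requests(self, item, info):
            for image_url in item['img_urls']:
                yield scrapy.Request(image_url)

        def file_path(self, request, response=None, info=None, *, item=None):
            # Store each file as <IMAGES_STORE>/<album name>/<original file name>.
            # The album name is used verbatim here; characters that are illegal
            # in directory names would need to be stripped first.
            return '%s/%s' % (item['album_name'], request.url.split('/')[-1])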


