  • Scrapy image crawling: multi-level, multi-page, saving into separate folders, replacing the default full folder

    Recording the whole crawler code here; I have already finished crawling the test site.

    items.py

    import scrapy


    class DemoItem(scrapy.Item):
        # define the fields for your item here like:
        folder_name = scrapy.Field()  # the column/page title, used as the folder name in place of "full"
        #img_name = scrapy.Field()  # name of the extracted image; unused since the site provides none
        img_url = scrapy.Field()  # the image URL

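    As a quick sanity check, this is what a populated item looks like (the values here are made-up placeholders); note that img_url holds a list, which is what the images pipeline expects:

    from demo.items import DemoItem

    item = DemoItem()
    item['folder_name'] = 'some-album-title'  # hypothetical title
    item['img_url'] = ['http://www.tttt8.net/example/1.jpg']  # must be a list
    print(dict(item))
    # {'folder_name': 'some-album-title', 'img_url': ['http://www.tttt8.net/example/1.jpg']}
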
    spider.py

    # -*- coding: utf-8 -*-
    import scrapy
    from demo.items import DemoItem


    class LogosSpider(scrapy.Spider):
        name = 'logos'
        allowed_domains = ['tttt8.net']
        #start_urls = ['http://www.tttt8.net/category/legbaby/']
        #start_urls = ['http://www.tttt8.net/category/ugirls/']
        #start_urls = ['http://www.tttt8.net/category/kelagirls/']
        start_urls = ['http://www.tttt8.net/category/xiurenwang/micatruisg/']
        # page counter used to build the pagination links for the first-level pages
        page = 1

        def parse(self, response):
            # grab the list of all albums/columns on the page
            li_list = response.xpath('//*[@id="post_container"]/li')
            for li in li_list:
                item = DemoItem()
                # only the column title is extracted here; it becomes the storage folder name later
                item['folder_name'] = li.xpath('./div[2]/h2/a/text()').extract_first()
                # link to the second-level page, handled by parse2
                next_plink = li.xpath('./div[1]/a/@href').extract_first()
                # pass the item built on the first-level page down to parse2 via meta
                yield scrapy.Request(url=next_plink, callback=self.parse2, meta={'item': item})
            # pagination links of the first-level page; there are other ways to paginate,
            # but building the URL from a counter keeps it simple
            page_list = response.xpath('//div[@class="pagination"]/a/@href').extract()
            # the last link points at the last page, e.g. .../page/7/
            last_page = page_list[-1]
            # parse the page number out of the URL; slicing a single character
            # (last_page[-2]) would break once there are 10 or more pages
            max_num = int(last_page.rstrip('/').split('/')[-1])
            # build the next pagination link; strictly less than, so the crawl
            # stops after requesting the last existing page
            if self.page < max_num:
                self.page += 1
                new_page_url = self.start_urls[0] + 'page/' + str(self.page) + '/'
                yield scrapy.Request(url=new_page_url, callback=self.parse)

        def parse2(self, response):
            # receive the item from the first-level page; it also has to be re-sent
            # with every pagination request below, or it gets lost and errors out
            item = response.meta['item']
            p_list = response.xpath('//*[@id="post_content"]/p/img')
            # extract the image links and hand them to the pipeline for download
            for img in p_list:
                img_url = img.xpath('./@src').extract_first()
                # the brackets are required: the image download machinery expects a list
                item['img_url'] = [img_url]
                yield item
            # pagination links of the second-level page; yield a request for each
            next_page_list = response.xpath('//*[@id="content"]/div[1]/div[3]/div[2]/a/@href').extract()
            for next_page in next_page_list:
                # meta must be attached here too, otherwise second-level pagination
                # errors out; this took a while to track down
                yield scrapy.Request(url=next_page, callback=self.parse2, meta={'item': item})

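    A standalone sketch of the page-number extraction above, with hypothetical URLs following the site's .../page/N/ pattern. Splitting the URL works for any number of digits, whereas the original single-character slice int(last_page[-2]) only handles pages 1 through 9:

    page_list = [
        'http://www.tttt8.net/category/xiurenwang/micatruisg/page/2/',
        'http://www.tttt8.net/category/xiurenwang/micatruisg/page/12/',
    ]
    last_page = page_list[-1]
    max_num = int(last_page.rstrip('/').split('/')[-1])
    print(max_num)  # 12; int(last_page[-2]) would have returned 2 here
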
    settings.py

    # -*- coding: utf-8 -*-


    BOT_NAME = 'demo'
    SPIDER_MODULES = ['demo.spiders']
    NEWSPIDER_MODULE = 'demo.spiders'

    # storage path and header; note the path separator ('D:pics' would be
    # interpreted relative to the current directory on drive D:)
    IMAGES_STORE = 'D:/pics'
    USER_AGENT = 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'

    DOWNLOAD_DELAY = 0.2
    # turn off robots.txt compliance
    ROBOTSTXT_OBEY = False

    ITEM_PIPELINES = {
        'demo.pipelines.DemoPipeline': 300,
    }
    # which item field holds the URLs to download
    IMAGES_URLS_FIELD = 'img_url'

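    With these settings the crawl is normally started from the project root with "scrapy crawl logos". As a minimal sketch, the same spider can also be launched from a plain Python script via Scrapy's CrawlerProcess, which reads settings.py through get_project_settings:

    from scrapy.crawler import CrawlerProcess
    from scrapy.utils.project import get_project_settings

    # run this from inside the Scrapy project so settings.py is picked up
    process = CrawlerProcess(get_project_settings())
    process.crawl('logos')  # spider name as defined in spider.py
    process.start()  # blocks until the crawl finishes
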
    pipelines.py

    import scrapy
    from scrapy.exceptions import DropItem
    from scrapy.pipelines.images import ImagesPipeline


    class DemoPipeline(ImagesPipeline):
        # standard override; issues one download request per URL in the item
        def get_media_requests(self, item, info):
            for img_url in item['img_url']:
                # the item is passed along via meta so file_path can rename the file later
                yield scrapy.Request(img_url, meta={'item': item})

        def file_path(self, request, response=None, info=None):
            item = request.meta['item']
            folder_name = item['folder_name']
            # img_name = item['img_name']  # unused: the images have no names on the site
            # since the images have no names, use the last part of the URL instead
            image_guid = request.url.split('/')[-1]
            img_name = image_guid
            # {0} is the folder, {1} is the file name
            filename = u'{0}/{1}'.format(folder_name, img_name)
            return filename

        # standard override; drops items whose images failed to download
        def item_completed(self, results, item, info):
            image_paths = [x['path'] for ok, x in results if ok]
            if not image_paths:
                raise DropItem('Image Downloaded Failed')
            return item

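    For context, the stock ImagesPipeline names every file by hashing its URL and stores everything under a single full/ directory; that default is exactly what the file_path override above replaces. A simplified sketch of the default naming scheme (the real implementation lives in scrapy.pipelines.images):

    import hashlib

    def default_style_path(url):
        # the stock pipeline stores images as full/<sha1-of-url>.jpg
        image_guid = hashlib.sha1(url.encode('utf-8')).hexdigest()
        return 'full/%s.jpg' % image_guid

    print(default_style_path('http://www.tttt8.net/example/1.jpg'))
    # full/<40 hex characters>.jpg

    One caveat: folder_name is taken straight from the page title, so a title containing characters that are illegal in Windows paths (such as \ / : * ? " < > |) would make the write fail; sanitizing the title before using it as a folder name is a sensible extension.
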
    Results: (screenshot of the resulting folder structure omitted)

  • Original post: https://www.cnblogs.com/passagain/p/11607711.html