  • scrapy--dbmeinv

    This is the first time I'm sharing one of my own crawler projects: scraping images from the Douban beauty pages. It's fairly simple but quite practical; the point is mainly to show the approach and help you build up your own crawling skills. I hope it helps!

    All right, let's get down to business.

    First, have a look at the results, to encourage you to get moving right away!

    1. Spider file: Dbmeinv.py

    # -*- coding: utf-8 -*-
    import scrapy
    from scrapy.linkextractors import LinkExtractor
    from dbmeinv.items import DbmeinvItem
    import re
    
    
    class DbmeinvSpider(scrapy.Spider):
        name = 'Dbmeinv'
        allowed_domains = ['www.dbmeinv.com']
        start_urls = ['https://www.dbmeinv.com/index.htm?cid=6',
                      'https://www.dbmeinv.com/index.htm?cid=7',
                      'https://www.dbmeinv.com/index.htm?cid=3']
    
        def parse(self, response):
            # extract the detail-page links on the current page
            le = LinkExtractor(restrict_css='ul.thumbnails')
    
            for link in le.extract_links(response):
                yield scrapy.Request(link.url, callback=self.parse_images)
            
            # extract the next-page link
            le1 = LinkExtractor(restrict_css='li.next_page')
            link1 = le1.extract_links(response)
    
            if link1:
                yield scrapy.Request(link1[0].url, callback=self.parse)
    
        def parse_images(self, response):
            meinv = DbmeinvItem()
            
            # This site is a bit special: the tag holding the image's src differs
            # across three cases. Each case can be found by following the errors the
            # crawl raises and inspecting where the src sits on the offending pages.
            if response.xpath('//div[@class="image-wrapper"]/img/@src').extract():
                url1 = response.xpath('//div[@class="image-wrapper"]/img/@src').extract()[0]
                meinv['images_url'] = url1
                image_name = re.findall(r'large/(.+?\.jpg)', url1)
                meinv['images'] = image_name[0]
    
            if response.xpath('//div[@class="panel-body markdown"]//img/@src'):
                url2 = response.xpath('//div[@class="panel-body markdown"]//img/@src').extract()[0]
                meinv['images_url'] = url2
                image_name = re.findall(r'large/(.+?\.jpg)', url2)
                meinv['images'] = image_name[0]
    
            if response.xpath('//div[@class="topic-detail panel panel-default"]//img/@src'):
                url3 = response.xpath('//div[@class="topic-detail panel panel-default"]//img/@src').extract()[1]
                meinv['images_url'] = url3
                image_name = re.findall(r'large/(.+?\.jpg)', url3)
                meinv['images'] = image_name[0]
    
            yield meinv    
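
    Once items.py, pipelines.py, and settings.py below are in place, the spider can be started from the project root with scrapy crawl Dbmeinv. If you prefer launching it from a script (convenient for debugging in an IDE), a minimal runner might look like this; the file name run.py is just a suggestion, not part of the original project:

    # run.py - place in the Scrapy project root
    from scrapy.cmdline import execute

    # equivalent to running "scrapy crawl Dbmeinv" on the command line
    execute(['scrapy', 'crawl', 'Dbmeinv'])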

    2.items.py

    import scrapy
    
    class DbmeinvItem(scrapy.Item):
    
        images_url = scrapy.Field()
        images     = scrapy.Field()

    3.pipelines.py

    import scrapy
    from scrapy.pipelines.images import ImagesPipeline  # ImagesPipeline downloads and stores the images for us
    from scrapy.exceptions import DropItem
    
    class DbmeinvPipeline(ImagesPipeline):
        def get_media_requests(self, item, info):      # request each image via the item's images_url
            yield scrapy.Request(item['images_url'])
    
        def item_completed(self, results, item, info):    # verify the image was stored successfully
            images_paths = [x['path'] for ok, x in results if ok]
    
            if not images_paths:
                raise DropItem("Item contains no images")
    
            return item
    
    
    class DuplicatesPipeline(object):            # deduplicates items by image file name
        def __init__(self):
            self.ids_seen = set()
    
        def process_item(self, item, spider):
            if item['images'] in self.ids_seen:
                raise DropItem("Duplicate item found: %s" % item)
            else:
                self.ids_seen.add(item['images'])
                return item

    4.settings.py

    IMAGES_STORE = r'C:\Users\Desktop\dbmeinv'     # directory where downloaded images are stored
    
    USER_AGENT_LIST = [   # pool of browser User-Agent strings to dodge 403 errors; see the middleware sketch after this listing
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
        "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
        "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
        "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
        "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24"
    ]
    
    CONCURRENT_REQUESTS = 16    # handle up to 16 requests concurrently
    DOWNLOAD_DELAY = 0.2        # wait 0.2 s between consecutive requests
    ROBOTSTXT_OBEY = False      # do not obey robots.txt
    COOKIES_ENABLED = False     # disable cookies
    
    
    ITEM_PIPELINES = {      # execution order, 1-1000; the lower the number, the higher the priority
        'dbmeinv.pipelines.DbmeinvPipeline': 1,
        'dbmeinv.pipelines.DuplicatesPipeline': 200,
    }
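
    Note that Scrapy's built-in USER_AGENT setting expects a single string, so the list above only takes effect through a small downloader middleware that picks a random entry per request. Below is a minimal sketch under that assumption; the module path dbmeinv/middlewares.py and the class name are my own choices, not part of the original project:

    # dbmeinv/middlewares.py
    import random

    class RandomUserAgentMiddleware(object):
        def __init__(self, user_agents):
            self.user_agents = user_agents

        @classmethod
        def from_crawler(cls, crawler):
            # read the pool defined in settings.py
            return cls(crawler.settings.getlist('USER_AGENT_LIST'))

        def process_request(self, request, spider):
            # attach a randomly chosen User-Agent to every outgoing request
            request.headers['User-Agent'] = random.choice(self.user_agents)

    It would then be registered in settings.py in place of the default user-agent middleware:

    DOWNLOADER_MIDDLEWARES = {
        'dbmeinv.middlewares.RandomUserAgentMiddleware': 400,
        'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None,
    }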

    If you run into any problems, feel free to ask. Let's improve together!

  • Original post: https://www.cnblogs.com/eilinge/p/9401348.html