  • 七月在线爬虫班学习笔记(六)——scrapy爬虫整体示例


    • 爬豆瓣文本例程 douban
    • 图片例程 douban_imgs

    1.爬豆瓣文本例程 douban




    # This package will contain the spiders of your Scrapy project
    # Please refer to the documentation for information on how to create and manage
    # your spiders.


    # -*- coding:utf-8 -*-
    '''by sudo rm -rf  http://imchenkun.com'''
    import scrapy
    from douban.items import DoubanBookItem
    class BookSpider(scrapy.Spider):
        name = 'douban-book'
        allowed_domains = ['douban.com']
        start_urls = [
        def parse(self, response):
            """Schedule ``parse_next`` for the first page and every paginator link.

            :param response: response of a listing page that contains a
                ``div.paginator`` block.
            :returns: generator of ``scrapy.Request`` objects, all handled by
                ``parse_next``.
            """
            # Request the first page again so that parse_next handles it too.
            yield scrapy.Request(response.url, callback=self.parse_next)
            # Request the other pages. Selecting @href directly skips anchors
            # that carry no href instead of failing with IndexError on them.
            for href in response.xpath('//div[@class="paginator"]/a/@href').extract():
                # urljoin leaves absolute links untouched and resolves relative
                # ones, which scrapy.Request would reject (missing scheme).
                yield scrapy.Request(response.urljoin(href), callback=self.parse_next)
        def parse_next(self, response):
            """Yield one ``DoubanBookItem`` per ``<tr class="item">`` row.

            Each field takes the first match of its XPath, evaluated relative
            to the row; a row without a match raises ``IndexError``.
            """
            field_paths = (
                ('name', 'td[2]/div[1]/a/@title'),
                ('content', 'td[2]/p/text()'),
                ('ratings', 'td[2]/div[2]/span[2]/text()'),
            )
            for row in response.xpath('//tr[@class="item"]'):
                book = DoubanBookItem()
                for field, path in field_paths:
                    book[field] = row.xpath(path).extract()[0]
                yield book


    # -*- coding:utf-8 -*-
    # Python 2 prompts with raw_input() and Python 3 with input(); Python 2's
    # print takes no parentheses while Python 3 always needs them. Adapt the
    # code below to the Python version you use.
    import scrapy
    from faker import Factory
    from douban.items import DoubanMovieCommentItem
    import urllib.parse

    f = Factory.create()


    class MailSpider(scrapy.Spider):
        """Log in to Douban, then scrape the reviews of movie subject 22266320.

        After the login form is submitted, the review list is walked page by
        page (``parse_next_page``), every review link on a list page is
        followed (``parse_comment_url``) and one ``DoubanMovieCommentItem`` is
        yielded per review page (``parse_comment``).
        """
        name = 'douban-comment'
        allowed_domains = ['accounts.douban.com', 'douban.com']
        start_urls = [
            'https://www.douban.com/'
        ]
        headers = {
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Accept-Encoding': 'gzip, deflate, br',
            'Accept-Language': 'zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3',
            'Connection': 'keep-alive',
            'Host': 'accounts.douban.com',
            'User-Agent': f.user_agent()
        }
        formdata = {
            'form_email': '你的邮箱',
            'form_password': '你的密码',
            # 'captcha-solution': '',
            # 'captcha-id': '',
            'login': '登录',
            'redir': 'https://www.douban.com/',
            'source': 'None'
        }

        def start_requests(self):
            """Open the login page; the cookiejar id keeps the session together."""
            return [scrapy.Request(url='https://www.douban.com/accounts/login',
                                   headers=self.headers,
                                   meta={'cookiejar': 1},
                                   callback=self.parse_login)]

        def parse_login(self, response):
            """Submit the login form, asking the user to solve a captcha if shown."""
            # A captcha has to be solved by hand. response.text is used because
            # response.body is bytes, and "str in bytes" raises TypeError.
            if 'captcha_image' in response.text:
                print('Copy the link:')
                link = response.xpath('//img[@class="captcha_image"]/@src').extract()[0]
                print(link)
                captcha_solution = input('captcha-solution:')
                # parse_qs/urlparse live in urllib.parse, not in urllib itself.
                captcha_id = urllib.parse.parse_qs(urllib.parse.urlparse(link).query, True)['id']
                self.formdata['captcha-solution'] = captcha_solution
                self.formdata['captcha-id'] = captcha_id
            return [scrapy.FormRequest.from_response(response,
                                                     formdata=self.formdata,
                                                     headers=self.headers,
                                                     meta={'cookiejar': response.meta['cookiejar']},
                                                     callback=self.after_login
                                                     )]

        def after_login(self, response):
            """Request the first review-list page for both of its consumers."""
            print(response.status)
            # NOTE(review): Host is set to www.douban.com although the requests
            # below go to movie.douban.com -- confirm this is intended.
            self.headers['Host'] = "www.douban.com"
            yield scrapy.Request(url='https://movie.douban.com/subject/22266320/reviews',
                                 meta={'cookiejar': response.meta['cookiejar']},
                                 headers=self.headers,
                                 callback=self.parse_comment_url)
            # Same URL again for pagination; dont_filter disables de-duplication.
            yield scrapy.Request(url='https://movie.douban.com/subject/22266320/reviews',
                                 meta={'cookiejar': response.meta['cookiejar']},
                                 headers=self.headers,
                                 callback=self.parse_next_page,
                                 dont_filter=True)

        def parse_next_page(self, response):
            """Follow the "next" link, if any, for review extraction and paging."""
            print(response.status)
            next_links = response.xpath('//span[@class="next"]/a/@href').extract()
            if not next_links:
                # No "next" link on this page. The explicit check replaces a
                # bare ``except:`` that hid every other error as well.
                print("Next page Error")
                return
            next_url = response.urljoin(next_links[0])
            print("下一页")
            print(next_url)
            yield scrapy.Request(url=next_url,
                                 meta={'cookiejar': response.meta['cookiejar']},
                                 headers=self.headers,
                                 callback=self.parse_comment_url,
                                 dont_filter=True)
            yield scrapy.Request(url=next_url,
                                 meta={'cookiejar': response.meta['cookiejar']},
                                 headers=self.headers,
                                 callback=self.parse_next_page,
                                 dont_filter=True)

        def parse_comment_url(self, response):
            """Request the detail page of every review listed on this page."""
            print(response.status)
            for item in response.xpath('//div[@class="main review-item"]'):
                comment_url = item.xpath('header/h3[@class="title"]/a/@href').extract()[0]
                comment_title = item.xpath('header/h3[@class="title"]/a/text()').extract()[0]
                print(comment_title)
                print(comment_url)
                yield scrapy.Request(url=comment_url,
                                     meta={'cookiejar': response.meta['cookiejar']},
                                     headers=self.headers,
                                     callback=self.parse_comment)

        def parse_comment(self, response):
            """Build a ``DoubanMovieCommentItem`` from one review detail page."""
            print(response.status)
            for item in response.xpath('//div[@id="content"]'):
                comment = DoubanMovieCommentItem()
                comment['useful_num'] = item.xpath('//div[@class="main-panel-useful"]/button[1]/text()').extract()[0].strip()
                comment['no_help_num'] = item.xpath('//div[@class="main-panel-useful"]/button[2]/text()').extract()[0].strip()
                comment['people'] = item.xpath('//span[@property="v:reviewer"]/text()').extract()[0]
                comment['people_url'] = item.xpath('//header[@class="main-hd"]/a[1]/@href').extract()[0]
                comment['star'] = item.xpath('//header[@class="main-hd"]/span[1]/@title').extract()[0]
                data_type = item.xpath('//div[@id="link-report"]/div/@data-original').extract()[0]
                print("data_type: " + data_type)
                # The review text sits in <p> children for type '0' and directly
                # in the first <div> for type '1'; other values leave 'comment'
                # unset.
                if data_type == '0':
                    comment['comment'] = " ##### ".join(map(lambda x: x.strip(), item.xpath('//div[@id="link-report"]/div/p/text()').extract()))
                elif data_type == '1':
                    comment['comment'] = " ##### ".join(map(lambda x: x.strip(), item.xpath('//div[@id="link-report"]/div[1]/text()').extract()))
                comment['title'] = item.xpath('//span[@property="v:summary"]/text()').extract()[0]
                comment['comment_page_url'] = response.url
                yield comment


    # -*- coding:utf-8 -*-
    '''by sudo rm -rf  http://imchenkun.com'''
    import scrapy
    from faker import Factory
    from douban.items import DoubanMailItem
    import urllib.parse
    f = Factory.create()
    class MailSpider(scrapy.Spider):
        """Log in to Douban and scrape the doumail list page.

        NOTE(review): in this listing the ``start_urls`` entry, the closing
        brackets of ``headers``/``formdata`` and the trailing arguments of the
        ``Request``/``FormRequest`` calls were missing. They are restored from
        the identical login code of the 'douban-comment' spider in this file;
        ``callback=self.parse_mail`` in ``after_login`` is inferred -- confirm.
        """
        name = 'douban-mail'
        allowed_domains = ['accounts.douban.com', 'douban.com']
        start_urls = [
            'https://www.douban.com/'
        ]
        headers = {
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Accept-Encoding': 'gzip, deflate, br',
            'Accept-Language': 'zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3',
            'Connection': 'keep-alive',
            'Host': 'accounts.douban.com',
            'User-Agent': f.user_agent()
        }
        formdata = {
            'form_email': '你的邮箱',
            'form_password': '你的密码',
            # 'captcha-solution': '',
            # 'captcha-id': '',
            'login': '登录',
            'redir': 'https://www.douban.com/',
            'source': 'None'
        }

        def start_requests(self):
            """Open the login page; the cookiejar id keeps the session together."""
            return [scrapy.Request(url='https://www.douban.com/accounts/login',
                                   headers=self.headers,
                                   meta={'cookiejar': 1},
                                   callback=self.parse_login)]

        def parse_login(self, response):
            """Submit the login form, asking the user to solve a captcha if shown."""
            # A captcha has to be solved by hand. response.text is used because
            # response.body is bytes, and "str in bytes" raises TypeError.
            if 'captcha_image' in response.text:
                print('Copy the link:')
                link = response.xpath('//img[@class="captcha_image"]/@src').extract()[0]
                print(link)
                captcha_solution = input('captcha-solution:')
                # parse_qs/urlparse live in urllib.parse, not in urllib itself.
                captcha_id = urllib.parse.parse_qs(urllib.parse.urlparse(link).query, True)['id']
                self.formdata['captcha-solution'] = captcha_solution
                self.formdata['captcha-id'] = captcha_id
            return [scrapy.FormRequest.from_response(response,
                                                     formdata=self.formdata,
                                                     headers=self.headers,
                                                     meta={'cookiejar': response.meta['cookiejar']},
                                                     callback=self.after_login
                                                     )]

        def after_login(self, response):
            """Request the doumail page with the logged-in session's cookies."""
            print(response.status)
            self.headers['Host'] = "www.douban.com"
            return scrapy.Request(url='https://www.douban.com/doumail/',
                                  meta={'cookiejar': response.meta['cookiejar']},
                                  headers=self.headers,
                                  callback=self.parse_mail)

        def parse_mail(self, response):
            """Yield one ``DoubanMailItem`` per ``<li>`` of the doumail list."""
            print(response.status)
            for item in response.xpath('//div[@class="doumail-list"]/ul/li'):
                mail = DoubanMailItem()
                mail['sender_time'] = item.xpath('div[2]/div/span[1]/text()').extract()[0]
                mail['sender_from'] = item.xpath('div[2]/div/span[2]/text()').extract()[0]
                mail['url'] = item.xpath('div[2]/p/a/@href').extract()[0]
                mail['title'] = item.xpath('div[2]/p/a/text()').extract()[0]
                print(mail)
                yield mail




    # -*- coding: utf-8 -*-
    import scrapy
    class DoubanBookItem(scrapy.Item):
        """Fields collected for one Douban book entry."""
        name = scrapy.Field()            # book title
        price = scrapy.Field()           # price
        edition_year = scrapy.Field()    # publication year
        publisher = scrapy.Field()       # publisher
        ratings = scrapy.Field()         # rating
        author = scrapy.Field()          # author
        content = scrapy.Field()         # raw info text; DoubanBookPipeline splits it on ' / '
    class DoubanMailItem(scrapy.Item):
        """Fields collected for one entry of the doumail list."""
        sender_time = scrapy.Field()     # time the mail was sent
        sender_from = scrapy.Field()     # sender
        url = scrapy.Field()             # URL of the doumail detail page
        title = scrapy.Field()           # doumail title
    class DoubanMovieCommentItem(scrapy.Item):
        """Fields collected for one movie review page."""
        useful_num = scrapy.Field()      # how many people found the review useful
        no_help_num = scrapy.Field()     # how many people found the review not useful
        people = scrapy.Field()          # reviewer
        people_url = scrapy.Field()      # reviewer's page
        star = scrapy.Field()            # rating
        comment = scrapy.Field()         # review text
        title = scrapy.Field()           # title
        comment_page_url = scrapy.Field()  # URL of the current page


    # -*- coding: utf-8 -*-
    class DoubanBookPipeline(object):
        """Split the raw ``content`` line of a book item into separate fields."""

        def process_item(self, item, spider):
            """Fill ``price``, ``edition_year`` and ``publisher`` from ``content``.

            ``content`` is a ' / '-separated line whose last three parts are
            publisher, publication date and price, e.g.
            ``[法] 圣埃克苏佩里 / 马振聘 / 人民文学出版社 / 2003-8 / 22.00元``.

            :param item: mapping with at least a ``content`` key.
            :param spider: spider that produced the item (unused).
            :returns: the same item. It is left untouched when ``content`` has
                fewer than three parts; indexing blindly used to set ``price``
                and then raise IndexError, leaving the item half-updated.
            """
            info = item['content'].split(' / ')
            if len(info) < 3:
                return item
            item['price'] = info[-1]
            item['edition_year'] = info[-2]
            item['publisher'] = info[-3]
            return item
    class DoubanMailPipeline(object):
        """Clean up the title of a doumail item."""

        def process_item(self, item, spider):
            """Remove every space and newline from ``item['title']``."""
            title = item['title']
            for unwanted in (' ', '\n'):
                title = title.replace(unwanted, '')
            item['title'] = title
            return item
    class DoubanMovieCommentPipeline(object):
        """Pass-through pipeline for movie review items."""

        def process_item(self, item, spider):
            """Return *item* unchanged."""
            return item


    # -*- coding: utf-8 -*-
    # Scrapy settings for douban project
    # For simplicity, this file contains only settings considered important or
    # commonly used. You can find more settings consulting the documentation:
    #     http://doc.scrapy.org/en/latest/topics/settings.html
    #     http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
    #     http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
    BOT_NAME = 'douban'
    SPIDER_MODULES = ['douban.spiders']
    NEWSPIDER_MODULE = 'douban.spiders'
    # Crawl responsibly by identifying yourself (and your website) on the user-agent
    # NOTE: instead of a fixed identifying string, the user agent is whatever
    # faker's user_agent() returns when this settings module is imported.
    from faker import Factory
    f = Factory.create()
    USER_AGENT = f.user_agent()
    # Obey robots.txt rules
    # Configure maximum concurrent requests performed by Scrapy (default: 16)
    # Configure a delay for requests for the same website (default: 0)
    # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
    # See also autothrottle settings and docs
    # The download delay setting will honor only one of:
    # Disable cookies (enabled by default)
    # Disable Telnet Console (enabled by default)
    # Override the default request headers:
    # (the assignment and braces were missing from this listing, which left the
    # entries below as a syntax error; DEFAULT_REQUEST_HEADERS is the Scrapy
    # setting this comment block belongs to)
    DEFAULT_REQUEST_HEADERS = {
        'Host': 'book.douban.com',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Language': 'zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3',
        'Accept-Encoding': 'gzip, deflate, br',
        'Connection': 'keep-alive',
    }
    # Values from the project template, kept for reference:
    #   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    #   'Accept-Language': 'en',
    # Enable or disable spider middlewares
    # See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
    #    'douban.middlewares.MyCustomSpiderMiddleware': 543,
    # Enable or disable downloader middlewares
    # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
    #    'douban.middlewares.MyCustomDownloaderMiddleware': 543,
    # Enable or disable extensions
    # See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
    #    'scrapy.extensions.telnet.TelnetConsole': None,
    # Configure item pipelines
    # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
    # (the assignment and braces were missing from this listing; only the
    # uncommented pipeline is active -- enable the one that matches the spider
    # you run)
    ITEM_PIPELINES = {
        #'douban.pipelines.DoubanBookPipeline': 300,
        #'douban.pipelines.DoubanMailPipeline': 600,
        'douban.pipelines.DoubanMovieCommentPipeline': 900,
    }
    # Enable and configure the AutoThrottle extension (disabled by default)
    # See http://doc.scrapy.org/en/latest/topics/autothrottle.html
    # The initial download delay
    # The maximum download delay to be set in case of high latencies
    # The average number of requests Scrapy should be sending in parallel to
    # each remote server
    # Enable showing throttling stats for every response received:
    # Enable and configure HTTP caching (disabled by default)
    # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
    #HTTPCACHE_DIR = 'httpcache'
    #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'


    # Automatically created by: scrapy startproject
    # For more information about the [deploy] section see:
    # https://scrapyd.readthedocs.org/en/latest/deploy.html
    [settings]
    default = douban.settings

    [deploy]
    #url = http://localhost:6800/
    project = douban

    2.图片例程 douban_imgs




    # This package will contain the spiders of your Scrapy project
    # Please refer to the documentation for information on how to create and manage
    # your spiders.


    # coding=utf-8
    from scrapy.spiders import Spider
    import re
    from scrapy import Request
    from douban_imgs.items import DoubanImgsItem
    class download_douban(Spider):
        """Collect the image URLs of one Douban photo album.

        The album id is given through the ``url`` spider argument, e.g.
        ``scrapy crawl download_douban -a url=<album id>``.
        """
        name = 'download_douban'
        default_headers = {
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Accept-Encoding': 'gzip, deflate, sdch, br',
            'Accept-Language': 'zh-CN,zh;q=0.8,en;q=0.6',
            'Cache-Control': 'max-age=0',
            'Connection': 'keep-alive',
            'Host': 'www.douban.com',
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36',
        }

        def __init__(self, url='1638835355', *args, **kwargs):
            """Build the album URL from the album id *url*.

            :param url: id of the album under ``/photos/album/``.
            """
            # Run the base-class initialiser first so that the attributes
            # assigned below always take precedence over its defaults.
            super(download_douban, self).__init__(*args, **kwargs)
            self.allowed_domains = ['douban.com']
            self.start_urls = [
                'http://www.douban.com/photos/album/%s/' % url]
            self.url = url

        def start_requests(self):
            """Request every start URL with the browser-like default headers."""
            for url in self.start_urls:
                yield Request(url=url, headers=self.default_headers, callback=self.parse)

        def parse(self, response):
            """Yield one ``DoubanImgsItem`` with all image URLs of the page.

            Nothing is yielded when the page has no image in the
            ``photolst clearfix`` block.
            """
            list_imgs = response.xpath('//div[@class="photolst clearfix"]//img/@src').extract()
            if list_imgs:
                item = DoubanImgsItem()
                item['image_urls'] = list_imgs
                yield item




    # -*- coding: utf-8 -*-
    # Define here the models for your scraped items
    # See documentation in:
    # http://doc.scrapy.org/en/latest/topics/items.html
    import scrapy
    from scrapy import Item, Field
    class DoubanImgsItem(scrapy.Item):
        """Item that carries the image URLs of one album page."""
        # define the fields for your item here like:
        # name = scrapy.Field()
        image_urls = Field()   # URLs to download; read by DoubanImgDownloadPipeline.get_media_requests
        images = Field()       # not written by code in this file; presumably filled by ImagesPipeline -- TODO confirm
        image_paths = Field()  # paths of the downloaded files; set in DoubanImgDownloadPipeline.item_completed


    # -*- coding: utf-8 -*-
    # Define your item pipelines here
    # Don't forget to add your pipeline to the ITEM_PIPELINES setting
    # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
    from scrapy.pipelines.images import ImagesPipeline
    from scrapy.exceptions import DropItem
    from scrapy import Request
    from scrapy import log
    class DoubanImgsPipeline(object):
        """Pass-through pipeline generated with the project."""

        def process_item(self, item, spider):
            """Return *item* unchanged."""
            return item
    class DoubanImgDownloadPipeline(ImagesPipeline):
        """Download every URL in ``item['image_urls']`` and record the file paths."""
        default_headers = {
            'accept': 'image/webp,image/*,*/*;q=0.8',
            'accept-encoding': 'gzip, deflate, sdch, br',
            'accept-language': 'zh-CN,zh;q=0.8,en;q=0.6',
            'cookie': 'bid=yQdC/AzTaCw',
            'referer': 'https://www.douban.com/photos/photo/2370443040/',
            'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36',
        }

        def get_media_requests(self, item, info):
            """Yield one download request per image URL of *item*.

            Every request sends the image's own URL as ``referer``.
            """
            for image_url in item['image_urls']:
                # Copy the headers per request instead of mutating the dict
                # that is shared by all instances of the class.
                headers = dict(self.default_headers, referer=image_url)
                yield Request(image_url, headers=headers)

        def item_completed(self, results, item, info):
            """Store the paths of the successful downloads on the item.

            :param results: iterable of ``(ok, info_dict)`` pairs; ``info_dict``
                has a ``path`` key for successful downloads.
            :raises DropItem: when no image was downloaded.
            """
            image_paths = [x['path'] for ok, x in results if ok]
            if not image_paths:
                raise DropItem("Item contains no images")
            item['image_paths'] = image_paths
            return item


    from scrapy import cmdline
    # Start the 'download_douban' spider through Scrapy's command-line entry
    # point, passing the command as an argv-style list.
    cmd_str = 'scrapy crawl download_douban'
    cmdline.execute(cmd_str.split())


    # -*- coding: utf-8 -*-
    # Scrapy settings for douban_imgs project
    # For simplicity, this file contains only settings considered important or
    # commonly used. You can find more settings consulting the documentation:
    #     http://doc.scrapy.org/en/latest/topics/settings.html
    #     http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
    #     http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
    # Project name and the package in which the project's spiders live.
    BOT_NAME = 'douban_imgs'
    SPIDER_MODULES = ['douban_imgs.spiders']
    NEWSPIDER_MODULE = 'douban_imgs.spiders'
    # Crawl responsibly by identifying yourself (and your website) on the user-agent
    # USER_AGENT = 'douban_imgs (+http://www.yourdomain.com)'
    # Configure maximum concurrent requests performed by Scrapy (default: 16)
    # Configure a delay for requests for the same website (default: 0)
    # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
    # See also autothrottle settings and docs
    # The download delay setting will honor only one of:
    # Disable cookies (enabled by default)
    # Disable Telnet Console (enabled by default)
    # Override the default request headers:
    # DEFAULT_REQUEST_HEADERS = {
    #   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    #   'Accept-Language': 'en',
    # }
    # Enable or disable spider middlewares
    # See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
    # SPIDER_MIDDLEWARES = {
    #    'douban_imgs.middlewares.MyCustomSpiderMiddleware': 543,
    # }
    # Enable or disable downloader middlewares
    # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
    # DOWNLOADER_MIDDLEWARES = {
    #    'douban_imgs.middlewares.MyCustomDownloaderMiddleware': 543,
    # }
    # Enable or disable extensions
    # See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
    # EXTENSIONS = {
    #    'scrapy.telnet.TelnetConsole': None,
    # }
    # Configure item pipelines
    # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
    # (the assignment and braces were missing from this listing, which left the
    # entry below as a syntax error)
    ITEM_PIPELINES = {
        'douban_imgs.pipelines.DoubanImgDownloadPipeline': 300,
    }
    # Directory the downloaded images are stored in. A raw string keeps the
    # backslash of the Windows path from being read as an escape sequence.
    IMAGES_STORE = r'D:\doubanimgs'
    #IMAGES_STORE = '/tmp'
    # Enable and configure the AutoThrottle extension (disabled by default)
    # See http://doc.scrapy.org/en/latest/topics/autothrottle.html
    # NOTE: AutoThrottle will honour the standard settings for concurrency and delay
    # The initial download delay
    # The maximum download delay to be set in case of high latencies
    # Enable showing throttling stats for every response received:
    # Enable and configure HTTP caching (disabled by default)
    # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
    # HTTPCACHE_DIR='httpcache'
    # HTTPCACHE_STORAGE='scrapy.extensions.httpcache.FilesystemCacheStorage'


    # Automatically created by: scrapy startproject
    # For more information about the [deploy] section see:
    # https://scrapyd.readthedocs.org/en/latest/deploy.html
    [settings]
    default = douban_imgs.settings

    [deploy]
    #url = http://localhost:6800/
    project = douban_imgs
