zoukankan      html  css  js  c++  java
  • 豆瓣爬虫

    setting.py

    # Scrapy project settings for the "doubanbook" crawler.
    BOT_NAME = 'doubanbook'
    
    SPIDER_MODULES = ['doubanbook.spiders']
    NEWSPIDER_MODULE = 'doubanbook.spiders'
    
    # Spoof a desktop Firefox UA so Douban does not reject the default Scrapy agent.
    USER_AGENT = 'Mozilla/5.0 (Windows NT 6.3; WOW64; rv:45.0) Gecko/20100101 Firefox/45.0'
    # Legacy (pre-Scrapy-2.1) feed-export settings: write all scraped items
    # to a local CSV file. Newer Scrapy versions use the FEEDS dict instead.
    FEED_URI = u'file:///E://douban3.csv'
    FEED_FORMAT = 'CSV'

    main.py

    # -*- coding: UTF-8 -*-
    from scrapy import cmdline
    
    cmdline.execute("scrapy crawl dbbook".split())

    items.py

    import scrapy
    
    
    class DoubanbookItem(scrapy.Item):
        """Container for one scraped Douban book record."""
        title = scrapy.Field()   # book title, whitespace-stripped
        rate = scrapy.Field()    # numeric rating text, e.g. "8.9"
        author = scrapy.Field()  # author line extracted from the abstract block

    dbbook.py

    # -*- coding: utf-8 -*-
    import scrapy
    import re
    from doubanbook.items import DoubanbookItem
    class DbbookSpider(scrapy.Spider):
        """Crawl a Douban book doulist and yield one item per listed book."""
        name = "dbbook"
        # allowed_domains is intentionally unset so follow-up page requests
        # to www.douban.com are not filtered out.
        #allowed_domains = ["www.douban.com/doulist/1264675/"]
        start_urls = (
            # Fixed: original had a stray double slash ('.../1264675//').
            'https://www.douban.com/doulist/1264675/',
        )

        def parse(self, response):
            """Parse one list page: yield a DoubanbookItem per book, then
            follow the "next page" link (if any) back into parse().

            Raises IndexError if a subject block lacks a title or rating node.
            """
            selector = scrapy.Selector(response)
            books = selector.xpath('//div[@class="bd doulist-subject"]')
            for each in books:
                # Fresh item per book: the original reused one mutated item
                # object across every yield, a classic shared-state bug.
                item = DoubanbookItem()
                title = each.xpath('div[@class="title"]/a/text()').extract()[0]
                rate = each.xpath('div[@class="rating"]/span[@class="rating_nums"]/text()').extract()[0]
                # Author text sits in raw HTML between the abstract div and the
                # first <br>, so it is pulled with a regex over the node's HTML.
                author = re.search(r'<div class="abstract">(.*?)<br', each.extract(), re.S).group(1)
                # Strip spaces and newlines left over from the page markup.
                # (Reconstructed: the published listing had these literals
                # broken across lines by copy/paste.)
                title = title.replace(' ', '').replace('\n', '')
                author = author.replace(' ', '').replace('\n', '')
                # NOTE(review): .encode('utf-8') is a Python 2 CSV-export idiom;
                # under Python 3 it would write bytes reprs into the feed --
                # confirm the target interpreter before removing it.
                item['title'] = title.encode('utf-8')
                item['rate'] = rate
                item['author'] = author.encode('utf-8')
                yield item
            # Pagination moved OUTSIDE the per-book loop: the original yielded
            # one next-page Request per book (deduped only by Scrapy's filter).
            nextp = selector.xpath('//span[@class="next"]/link/@href').extract()
            if nextp:
                next_url = nextp[0]  # renamed from `next` to avoid shadowing the builtin
                print(next_url)
                yield scrapy.http.Request(next_url, callback=self.parse)
  • 相关阅读:
    SD_WebImage-03-多线程+下载任务放入非主线程执行
    NSOperationQueue_管理NSOperation-02-多线程
    CALayer小结-基本使用00-UI进阶
    XMPP-UI进阶-01
    XMPP总结-UI进阶-00
    UI控件总结-UI初级
    转场动画-01-day4
    暂停-开始动画-核心动画-08-day4
    核心动画-04-CALayer隐式动画
    Android开发技术周报 Issue#71
  • 原文地址:https://www.cnblogs.com/Erick-L/p/6739882.html
Copyright © 2011-2022 走看看