zoukankan      html  css  js  c++  java
  • 豆瓣爬虫

    setting.py

    BOT_NAME = 'doubanbook'
    
    SPIDER_MODULES = ['doubanbook.spiders']
    NEWSPIDER_MODULE = 'doubanbook.spiders'
    
    # Spoof a desktop Firefox user agent so Douban does not reject the
    # default Scrapy UA string.
    USER_AGENT = 'Mozilla/5.0 (Windows NT 6.3; WOW64; rv:45.0) Gecko/20100101 Firefox/45.0'
    # Legacy feed-export settings: write every scraped item to a local CSV file.
    FEED_URI = u'file:///E://douban3.csv'
    FEED_FORMAT = 'CSV'

    main.py

    # -*- coding: UTF-8 -*-
    from scrapy import cmdline
    
    # Run the "dbbook" spider exactly as if `scrapy crawl dbbook`
    # had been typed on the command line.
    cmdline.execute(['scrapy', 'crawl', 'dbbook'])

    items.py

    import scrapy
    
    
    class DoubanbookItem(scrapy.Item):
        """Container for one book scraped from a Douban doulist page."""
        title = scrapy.Field()   # book title text
        rate = scrapy.Field()    # rating value as shown on the page
        author = scrapy.Field()  # author/publisher abstract line

    dbbook.py

    # -*- coding: utf-8 -*-
    import scrapy
    import re
    from doubanbook.items import DoubanbookItem
    class DbbookSpider(scrapy.Spider):
        """Crawl a Douban book doulist, yielding one item per book and
        following the "next page" link until the list is exhausted."""
        name = "dbbook"
        #allowed_domains = ["www.douban.com/doulist/1264675/"]
        start_urls = (
            'https://www.douban.com/doulist/1264675//',
        )

        def parse(self, response):
            """Parse one doulist page.

            Yields a DoubanbookItem for every book card on the page, then a
            single Request for the next page (the original yielded the
            pagination request once per book, inside the loop).
            """
            books = response.xpath('//div[@class="bd doulist-subject"]')
            for book in books:
                # Fresh item per book: the original reused one shared item
                # instance for every yield, a classic Scrapy pitfall.
                item = DoubanbookItem()
                title = book.xpath('div[@class="title"]/a/text()').extract()[0]
                rate = book.xpath('div[@class="rating"]/span[@class="rating_nums"]/text()').extract()[0]
                # The author line is bare text between the abstract div and a
                # <br>, so a regex over the raw HTML is the simplest grab.
                author = re.search(r'<div class="abstract">(.*?)<br', book.extract(), re.S).group(1)
                # Strip spaces and newlines. The scraped original had literal
                # line breaks inside these string literals (a SyntaxError);
                # '\n' is what was intended.
                item['title'] = title.replace(' ', '').replace('\n', '')
                item['rate'] = rate
                item['author'] = author.replace(' ', '').replace('\n', '')
                yield item
            # Follow pagination once per response.
            next_pages = response.xpath('//span[@class="next"]/link/@href').extract()
            if next_pages:
                yield scrapy.Request(next_pages[0], callback=self.parse)
  • 相关阅读:
    洛谷—— P3353 在你窗外闪耀的星星
    洛谷—— P1238 走迷宫
    洛谷—— P1262 间谍网络
    9.8——模拟赛
    洛谷—— P1189 SEARCH
    算法
    May 22nd 2017 Week 21st Monday
    May 21st 2017 Week 21st Sunday
    May 20th 2017 Week 20th Saturday
    May 19th 2017 Week 20th Friday
  • 原文地址:https://www.cnblogs.com/Erick-L/p/6739882.html
Copyright © 2011-2022 走看看