zoukankan      html  css  js  c++  java
  • 豆瓣爬虫

    settings.py

    # Scrapy project settings for the "doubanbook" crawler.

    # Name of the bot/project.
    BOT_NAME = 'doubanbook'

    # Packages in which Scrapy looks for existing spiders and creates new ones.
    SPIDER_MODULES = ['doubanbook.spiders']
    NEWSPIDER_MODULE = 'doubanbook.spiders'

    # Present the crawler as a desktop Firefox browser.
    USER_AGENT = 'Mozilla/5.0 (Windows NT 6.3; WOW64; rv:45.0) Gecko/20100101 Firefox/45.0'

    # Export every scraped item to a CSV file on the local E: drive.
    FEED_URI = u'file:///E://douban3.csv'
    # Scrapy registers its built-in feed exporters under lowercase keys
    # ('csv', 'json', 'xml', ...), so use the canonical lowercase spelling
    # instead of relying on the framework to normalise 'CSV'.
    FEED_FORMAT = 'csv'

    main.py

    # -*- coding: UTF-8 -*-
    from scrapy import cmdline
    
    # Launch the "dbbook" spider exactly as `scrapy crawl dbbook` would on the
    # command line. The guard keeps the crawl from starting as a side effect
    # when this module is merely imported.
    if __name__ == "__main__":
        cmdline.execute(["scrapy", "crawl", "dbbook"])

    items.py

    import scrapy
    
    
    class DoubanbookItem(scrapy.Item):
        """Container for one book scraped from a Douban book list.

        All three fields are plain ``scrapy.Field()`` declarations with no
        serializer or other metadata attached.
        """

        # Text of the book's title link.
        title = scrapy.Field()
        # Rating text as shown on the page (presumably a score such as "9.1"
        # -- confirm against the spider that fills it).
        rate = scrapy.Field()
        # Author text; presumably the first line of the book's abstract block
        # -- confirm against the spider that fills it.
        author = scrapy.Field()

    dbbook.py

    # -*- coding: utf-8 -*-
    import scrapy
    import re
    from doubanbook.items import DoubanbookItem
    class DbbookSpider(scrapy.Spider):
        """Scrape title, rating and author for every book in a Douban doulist.

        Starting from ``start_urls``, each list page is parsed into
        ``DoubanbookItem`` objects and the "next page" link is followed until
        no such link is present.
        """

        name = "dbbook"
        # NOTE(review): left disabled as in the original. Per the Scrapy docs,
        # allowed_domains takes bare domain names (e.g. "www.douban.com"),
        # not a URL with a path, so this value would need fixing before use.
        #allowed_domains = ["www.douban.com/doulist/1264675/"]
        start_urls = (
            'https://www.douban.com/doulist/1264675//',
        )

        @staticmethod
        def _squash(text):
            """Return *text* with every space and newline character removed."""
            return text.replace(' ', '').replace('\n', '')

        def parse(self, response):
            """Yield one item per book on the page, then a request for the next page.

            Args:
                response: The downloaded doulist page.

            Yields:
                ``DoubanbookItem`` objects with UTF-8 encoded ``title`` and
                ``author`` and the raw ``rate`` text, followed by at most one
                ``Request`` for the next list page.
            """
            for book in response.xpath('//div[@class="bd doulist-subject"]'):
                title = book.xpath('div[@class="title"]/a/text()').extract_first()
                if title is None:
                    # An entry without a title link carries nothing worth
                    # exporting; skip it instead of crashing the whole page.
                    continue

                # Not every entry is guaranteed to have a rating or an
                # abstract, so fall back to an empty string for both.
                rate = book.xpath(
                    'div[@class="rating"]/span[@class="rating_nums"]/text()'
                ).extract_first(default='')
                # The author is taken as the text between the opening of the
                # abstract <div> and its first <br>.
                found = re.search(
                    r'<div class="abstract">(.*?)<br', book.extract(), re.S
                )
                author = found.group(1) if found else ''

                # Build a fresh item per book: a single shared item would be
                # mutated after it has already been handed to the pipeline.
                item = DoubanbookItem()
                item['title'] = self._squash(title).encode('utf-8')
                item['rate'] = rate
                item['author'] = self._squash(author).encode('utf-8')
                yield item

            # Pagination belongs to the page, not to each book: schedule the
            # next page once, even when this page produced no items.
            next_url = response.xpath(
                '//span[@class="next"]/link/@href'
            ).extract_first()
            if next_url:
                self.logger.info("Following next page: %s", next_url)
                yield scrapy.http.Request(
                    response.urljoin(next_url), callback=self.parse
                )
  • 相关阅读:
    eclipse快捷键
    ideaIU-2017.3.2版本的免费安装以及2020版本破解
    并发相关问题以及java基础知识
    飞秋软件-局域网内互传
    视频解析网站
    bzoj2458: [BeiJing2011]最小三角形
    bzoj3170: [Tjoi2013]松鼠聚会
    bzoj5056:OI游戏
    dtoj#4224. 小L的占卜
    dtoj#4222. 小b爱旅行(travel)
  • 原文地址:https://www.cnblogs.com/Erick-L/p/6739882.html
Copyright © 2011-2022 走看看