zoukankan      html  css  js  c++  java
  • 豆瓣爬虫

    settings.py

    # Scrapy project identity and spider discovery.
    BOT_NAME = 'doubanbook'
    
    SPIDER_MODULES = ['doubanbook.spiders']
    NEWSPIDER_MODULE = 'doubanbook.spiders'
    
    # Fixed desktop Firefox user agent so Douban serves the normal HTML layout
    # instead of blocking the default Scrapy UA.
    USER_AGENT = 'Mozilla/5.0 (Windows NT 6.3; WOW64; rv:45.0) Gecko/20100101 Firefox/45.0'
    # Export every scraped item to a CSV file on drive E: (Windows path).
    FEED_URI = u'file:///E://douban3.csv'
    # NOTE(review): Scrapy's FEED_EXPORTERS are keyed by lowercase format
    # names ('csv'); confirm the installed version accepts uppercase 'CSV'.
    FEED_FORMAT = 'CSV'

    main.py

    # -*- coding: UTF-8 -*-
    from scrapy import cmdline
    
    # Convenience launcher so the spider can be started from an IDE.
    # Equivalent to running `scrapy crawl dbbook` in the project directory.
    # Guarded so importing this module does not immediately start a crawl.
    if __name__ == '__main__':
        cmdline.execute("scrapy crawl dbbook".split())

    items.py

    import scrapy
    
    
    class DoubanbookItem(scrapy.Item):
        """Container for one scraped Douban book entry."""
        # define the fields for your item here like:
        # name = scrapy.Field()
        title = scrapy.Field()   # book title (the spider stores utf-8 encoded bytes)
        rate = scrapy.Field()    # rating string as shown on the page, e.g. "8.9"
        author = scrapy.Field()  # author line extracted from the abstract block

    dbbook.py

    # -*- coding: utf-8 -*-
    import scrapy
    import re
    from doubanbook.items import DoubanbookItem
    class DbbookSpider(scrapy.Spider):
        """Crawl a Douban book doulist and yield one DoubanbookItem per book.

        Each item carries the book's title, rating and author; pagination is
        followed via the "next" link until the list is exhausted.
        """
        name = "dbbook"
        #allowed_domains = ["www.douban.com/doulist/1264675/"]
        start_urls = (
            'https://www.douban.com/doulist/1264675//',
        )
    
        def parse(self, response):
            # A Response supports .xpath() directly; no explicit Selector needed.
            books = response.xpath('//div[@class="bd doulist-subject"]')
            for each in books:
                # Fresh item per book: reusing one mutated instance would make
                # every yielded reference point at the last book scraped.
                item = DoubanbookItem()
                # extract_first(default='') avoids IndexError when a node is missing.
                title = each.xpath('div[@class="title"]/a/text()').extract_first(default='')
                rate = each.xpath('div[@class="rating"]/span[@class="rating_nums"]/text()').extract_first(default='')
                # The author sits in free-form markup; take the text between the
                # abstract div and its first <br>.
                m = re.search(r'<div class="abstract">(.*?)<br', each.extract(), re.S)
                author = m.group(1) if m else ''
                # Strip layout whitespace (spaces and newlines) from the raw text.
                title = title.replace(' ', '').replace('\n', '')
                author = author.replace(' ', '').replace('\n', '')
                item['title'] = title.encode('utf-8')   # bytes for the Py2-era CSV feed
                item['rate'] = rate
                item['author'] = author.encode('utf-8')
                yield item
            # Follow pagination once per page (the original yielded this request
            # inside the book loop, once per book). 'next' renamed to avoid
            # shadowing the builtin.
            nextp = response.xpath('//span[@class="next"]/link/@href').extract()
            if nextp:
                next_url = nextp[0]
                print(next_url)
                yield scrapy.http.Request(next_url, callback=self.parse)
  • 相关阅读:
    leetcode 47 Permutations II ----- java
    leetcode 46 Permutations ----- java
    leetcode 45 Jump Game II ---- java
    leetcode 44 Wildcard Matching ----java
    leetcode 43 Multiply Strings ----java
    leetcode 42 Trapping Rain Water ---java
    leetcode 41 First Missing Positive ---java
    leetcode 40 Combination Sum II --- java
    leetcode 39 Combination Sum --- java
    java 上下文切换
  • 原文地址:https://www.cnblogs.com/Erick-L/p/6739882.html
Copyright © 2011-2022 走看看