zoukankan      html  css  js  c++  java
  • 爬取多个url

    # -*- coding: utf-8 -*-
    import scrapy
    from qiubai.items import QiubaiItem
    
    
    class QiushibaiSpider(scrapy.Spider):
        """Spider that scrapes author/content pairs from qiushibaike text listings.

        The crawl starts at the first listing page and then follows the numbered
        ``/text/page/N/`` URLs one at a time, up to and including ``max_page``.
        """

        name = 'qiushibai'
        start_urls = ['http://www.qiushibaike.com/text/']
        # Template for the numbered listing pages; filled in with the page number.
        url = "https://www.qiushibaike.com/text/page/%d/"
        # Number of the most recently requested listing page (start_urls is page 1).
        page = 1
        # Last listing page that will be requested.
        max_page = 13

        def parse(self, response):
            """Yield one item per post on the page, then request the next page.

            Items are yielded one by one instead of being collected into a list
            and returned, so the same generator can also emit the follow-up
            request.

            Args:
                response: The downloaded listing page.

            Yields:
                A ``QiubaiItem`` with ``"author"`` and ``"content"`` set for each
                post found, followed by a ``scrapy.Request`` for the next listing
                page while ``page`` is still below ``max_page``.
            """
            # Scrapy exposes XPath directly on the response; every child <div> of
            # #content-left is treated as one post.
            for post in response.xpath('//div[@id="content-left"]/div'):
                item = QiubaiItem()
                # extract_first() pulls the first matched value out of the
                # Selector objects (like extract()[0], but per the Scrapy docs
                # it gives None instead of raising when nothing matched).
                item["author"] = post.xpath("./div/a[2]/h2/text()").extract_first()
                item["content"] = post.xpath(
                    './/div[@class="content"]/span/text()'
                ).extract_first()
                # Yielding the item submits it to the item pipeline.
                yield item

            # Strict comparison: the counter is incremented before the URL is
            # built, so `<=` would schedule a request for max_page + 1.
            if self.page < self.max_page:
                self.page += 1
                # Report the page that is about to be requested.
                print("正在爬取第%d页" % self.page)
                yield scrapy.Request(url=self.url % self.page, callback=self.parse)

    要点：在 parse 方法中用 yield 返回 scrapy.Request，并通过 callback 参数指定解析函数（这里仍是 parse 本身），即可继续爬取后续页面的 url。

  • 相关阅读:
    走亲访友
    分而治之
    红色警报
    小字辈
    最长对称子串
    树的遍历
    acwing练习
    组合计数
    同余
    乘法逆元
  • 原文地址:https://www.cnblogs.com/cjj-zyj/p/10120006.html
Copyright © 2011-2022 走看看