zoukankan      html  css  js  c++  java
  • Scrapy框架-Spider和CrawlSpider的区别

    1.目标

    http://wz.sun0769.com/index.php/question/questionType?type=4&page=

    爬取每个页面链接的内部内容和投诉信息

    2.方法1:通过Spider爬取

    # -*- coding: utf-8 -*-
    import scrapy
    from dongguanSpider.items import DongguanItem
    
    class SunSpider(scrapy.Spider):
        """Crawl complaint posts from wz.sun0769.com by manually paging the listing."""
        name = 'sun'
        allowed_domains = ['wz.sun0769.com']
        # Base listing URL; the numeric page offset is appended to it.
        url = 'http://wz.sun0769.com/index.php/question/questionType?type=4&page='
        offset = 0

        start_urls = [url + str(offset)]


        def parse(self, response):
            """Extract every post link on a listing page and schedule the next page."""
            # All post links on the current listing page.
            links = response.xpath('//div[@class="greyframe"]/table//td/a[@class="news14"]/@href').extract()
            for link in links:
                # Follow each post's detail page with parse_item.
                yield scrapy.Request(link, callback=self.parse_item)

            # Keep paging (30 posts per page) until the known last offset,
            # re-entering parse() for every new listing page.
            if self.offset <= 71160:
                self.offset += 30

                yield scrapy.Request(self.url + str(self.offset), callback=self.parse)


        def parse_item(self, response):
            """Parse one post's detail page into a DongguanItem."""
            item = DongguanItem()

            # NOTE(review): the split delimiter assumes an ASCII ':' in the
            # title/number text — confirm against the live page (it may use a
            # fullwidth '：').
            item['title'] = response.xpath('//div[@class="wzy1"]/table[1]//tr/td[2]/span[1]/text()').extract()[0].split(':')[-1]
            item['url'] = response.url
            item['number'] = response.xpath('//div[@class="wzy1"]/table[1]//tr/td[2]/span[2]/text()').extract()[0].split(':')[-1]

            # Posts with an image keep the body text in a different container.
            content_pic = response.xpath('//div[@class="textpic"]/img/@src').extract()

            if not content_pic:
                # Join ALL text nodes (the original grabbed only [0], making the
                # join a no-op) and strip non-breaking spaces — the '\xa0'
                # escape was mangled to 'xa0' in the original paste.
                content_no_pic = response.xpath('//div[@class="wzy1"]/table[2]//tr/td/text()').extract()
                item['content'] = "".join(content_no_pic).replace("\xa0", "")
            else:
                item['content'] = content_pic[0].replace("\xa0", "")

            yield item
    

    3. 通过CrawlSpider爬取

    # -*- coding: utf-8 -*-
    import scrapy
    from scrapy.linkextractors import LinkExtractor
    from scrapy.spiders import CrawlSpider, Rule
    from dongguan.items import DongguanItem
    
    
    class SunSpider(CrawlSpider):
        """Crawl complaint posts via CrawlSpider link rules instead of manual paging."""
        name = 'sun'
        allowed_domains = ['wz.sun0769.com']
        start_urls = ['http://wz.sun0769.com/index.php/question/questionType?type=4&page=30']

        rules = [
            # Follow pagination links; no callback, so CrawlSpider just keeps
            # extracting links from them. The blog paste dropped the regex
            # backslashes ('d+' was originally r'\d+').
            Rule(LinkExtractor(allow=(r'type=4&page=\d+',))),
            # Post detail pages are handed to the item parser. The literal dot
            # before 'shtml' is escaped so it doesn't match any character.
            Rule(LinkExtractor(allow=(r'/html/question/\d+/\d+\.shtml',)), callback='parseDongguan'),
        ]

        def parseDongguan(self, response):
            """Parse one post's detail page into a DongguanItem."""
            item = DongguanItem()

            # NOTE(review): the split delimiter assumes an ASCII ':' in the
            # title/number text — confirm against the live page (it may use a
            # fullwidth '：').
            item['title'] = response.xpath('//div[@class="wzy1"]/table[1]//tr/td[2]/span[1]/text()').extract()[0].split(':')[-1]
            item['url'] = response.url
            item['number'] = response.xpath('//div[@class="wzy1"]/table[1]//tr/td[2]/span[2]/text()').extract()[0].split(':')[-1]

            # Posts with an image keep the body text in a different container.
            content_pic = response.xpath('//div[@class="textpic"]/img/@src').extract()

            if not content_pic:
                # Join ALL text nodes (the original grabbed only [0], making the
                # join a no-op) and strip non-breaking spaces — the '\xa0'
                # escape was mangled to 'xa0' in the original paste.
                content_no_pic = response.xpath('//div[@class="wzy1"]/table[2]//tr/td/text()').extract()
                item['content'] = "".join(content_no_pic).replace("\xa0", "")
            else:
                item['content'] = content_pic[0].replace("\xa0", "")

            yield item
    
    
  • 相关阅读:
    假期学习01
    构建之法读后感(二)
    构建之法读后感(一)
    每日日报
    每日日报
    每日日报
    每日日报
    每日日报
    每周日报
    每日日报
  • 原文地址:https://www.cnblogs.com/haochen273/p/10386173.html
Copyright © 2011-2022 走看看