zoukankan      html  css  js  c++  java
  • 土巴兔数据爬取

    # -*- coding: utf-8 -*-
    import scrapy
    from tubatu.items import TubatuItem
    
    class TubatuzxSpider(scrapy.Spider):
        """Crawl the paginated company listing on fs.to8to.com.

        Each listing page is fetched from ``url + <page number> + '.html'``.
        For every ``<li>`` in the ``company-data-list`` element one
        ``TubatuItem`` is yielded with the fields ``name`` and ``dianhua``.
        Pages are followed sequentially from ``yeshu`` up to ``max_page``.
        """

        name = 'tubatuzx'
        # Listing pages look like http://fs.to8to.com/company/list_4.html
        url = 'http://fs.to8to.com/company/list_'
        # Number of the page currently being crawled (1-based).
        yeshu = 1
        # Last page number that will be requested.
        max_page = 4
        start_urls = [url + str(yeshu) + '.html']

        def parse(self, response):
            """Yield one item per company entry, then request the next page.

            Args:
                response: The downloaded listing page.

            Yields:
                ``TubatuItem`` objects for the entries on this page, followed
                by a ``scrapy.Request`` for the next listing page while
                ``yeshu`` is still below ``max_page``.
            """
            entries = response.xpath('//ul[@class="company-data-list"]/li')
            for entry in entries:
                company_name = entry.xpath(
                    './a/div[2]/p[1]/span/text()').extract_first()
                if company_name is None:
                    # An entry without a name node used to raise IndexError,
                    # which dropped the rest of the page and the pagination
                    # request; skip just this entry instead.
                    self.logger.warning(
                        'Skipping entry without a name on %s', response.url)
                    continue

                item = TubatuItem()
                item['name'] = company_name.strip()
                # A single space is stored when the second <p> has no text,
                # matching the placeholder the spider has always emitted.
                item['dianhua'] = entry.xpath(
                    './a/div[2]/p[2]/text()').extract_first(default=' ')
                yield item

            if self.yeshu < self.max_page:
                self.yeshu += 1
                next_url = self.url + str(self.yeshu) + '.html'
                self.logger.info('>>>>>>>>>>>>>>>%s<<<<<<<<<<', next_url)
                yield scrapy.Request(next_url, callback=self.parse)

    具体代码:https://github.com/mysteriousKiller/tubatu

  • 相关阅读:
    jvm内存模型
    tomcat下出现The file is absent or does not have execute&
    linux下安装apache
    什么时候能踏上征程?追寻真正的自由和信仰
    linux基础命令
    UVALive 7263 Today Is a Rainy Day
    HDU 6071 Lazy Running
    HihoCoder 1634 Puzzle Game
    HDU 6228 Tree
    HDU 2222 Keywords Search AC自动机模版
  • 原文地址:https://www.cnblogs.com/mysterious-killer/p/10136950.html
Copyright © 2011-2022 走看看