zoukankan      html  css  js  c++  java
  • 分布式爬虫

    1. 爬取首页

     # -*- coding: utf-8 -*-
     from scrapy_redis.spiders import RedisCrawlSpider
     import scrapy
     import redis


     class DemoSpider(RedisCrawlSpider):
         """Stage 1: crawl the home page and queue category list URLs in Redis.

         Start URLs are taken from the Redis key named by ``redis_key``; every
         link found on the page is rewritten and pushed onto the
         ``demo:list_urls`` Redis list.
         """

         name = 'demo'
         allowed_domains = ['demo.com']
         redis_key = 'demo:start_urls'
         # Client used to hand discovered URLs to the next stage.
         redis_info = redis.Redis(host='111.111.111.111', port=6379)

         # NOTE(review): no crawl rules are defined here, so a plain RedisSpider
         # base would probably be sufficient -- confirm before changing.

         @staticmethod
         def _with_first_param(url, value='489'):
             """Return *url* with the value of its first ``key=value`` pair replaced.

             Everything between the first ``=`` and the following ``&`` is
             swapped for *value*. A URL without ``=`` is returned unchanged, and
             a URL without a following ``&`` simply ends after *value*
             (``str.find`` returns -1 in both cases, which would otherwise
             corrupt the slice).
             """
             eq_pos = url.find('=')
             if eq_pos == -1:
                 return url
             amp_pos = url.find('&', eq_pos)
             tail = url[amp_pos:] if amp_pos != -1 else ''
             return url[:eq_pos + 1] + value + tail

         def parse(self, response):
             """Push every rewritten category link on the page to ``demo:list_urls``."""
             sel = scrapy.Selector(response)
             hrefs = sel.xpath(
                 '//div[@id="search_right_demo"]/div/div[@class="clearfixed"]/a/@href'
             ).extract()
             for href in hrefs:
                 # Resolve against the page URL; the original relied on an
                 # undefined ``self.host_urls`` attribute.
                 url = response.urljoin(href)
                 # NOTE(review): the meaning of the hard-coded '489' is not
                 # visible here (presumably a fixed filter/region id) -- confirm.
                 url = self._with_first_param(url, '489')
                 self.redis_info.lpush('demo:list_urls', url)

    2. 爬取URL列表

    # -*- coding: utf-8 -*-
    from scrapy_redis.spiders import RedisCrawlSpider
    import scrapy
    import redis


    class DemoListSpider(RedisCrawlSpider):
        """Stage 2: crawl list pages and queue detail-page URLs in Redis.

        List-page URLs are taken from the Redis key named by ``redis_key``.
        Each detail link found is pushed onto ``demo:info_urls``; the
        "next page" link, when present, is pushed back onto ``demo:list_urls``
        so pagination continues.
        """

        name = 'demo_list'
        allowed_domains = ['demo.com']
        redis_key = 'demo:list_urls'
        # Client used to hand discovered URLs to the next stage.
        redis_info = redis.Redis(host='111.111.111.111', port=6379)

        def parse(self, response):
            """Queue every detail link on the page, then the next list page."""
            selector = scrapy.Selector(response)
            detail_hrefs = selector.xpath(
                '//*[@id="newlist_list_content_table"]/table/tr[1]/td[1]/div/a/@href'
            ).extract()
            for href in detail_hrefs:
                # urljoin leaves absolute URLs untouched and resolves relative
                # ones, so whatever lands in Redis can be fetched as-is.
                self.redis_info.lpush('demo:info_urls', response.urljoin(href))

            next_page = selector.xpath('//a[@class="next-page"]/@href').extract()
            if next_page:
                self.redis_info.lpush('demo:list_urls', response.urljoin(next_page[0]))

    3. 爬取商品详细信息

    # -*- coding: utf-8 -*-
    from scrapy_redis.spiders import RedisCrawlSpider
    from demo.items import demoItem
    import scrapy
    import redis
    class demoInfoSpider(RedisCrawlSpider):
        """Stage 3: crawl detail pages and yield one item per page.

        Detail-page URLs are taken from the Redis key named by ``redis_key``.
        """

        name = 'demo_info'
        allowed_domains = ['zhaopin.com']
        redis_key = 'demo:info_urls'
        redis_info = redis.Redis(host='111.111.111.111', port=6379)

        def parse(self, response):
            """Extract the header fields of a detail page and yield them as an item."""
            sel = scrapy.Selector(response)
            # All three fields live under the same fixed header container.
            header = '//div[@class="top-fixed-box"]/div[@class="fixed-inner-box"]/div[1]'
            zwmc = sel.xpath(header + '/h1/text()').extract()
            gsmc = sel.xpath(header + '/h2/a/text()').extract()
            flxx = sel.xpath(header + '/div/span/text()').extract()

            # The original snippet yielded an ``item`` that was never created.
            # NOTE(review): assumes demoItem declares fields named zwmc, gsmc
            # and flxx -- verify against demo/items.py.
            item = demoItem()
            item['zwmc'] = zwmc
            item['gsmc'] = gsmc
            item['flxx'] = flxx
            yield item

  • 相关阅读:
    详解go语言的array和slice 【一】
    node.js 事件循环
    搭建Docker私有仓库--自签名方式
    详解JavaScript闭包
    [个人翻译]Redis 集群教程(下)
    转:CMake 使用方法
    转: Ogre实现无缝地图要改的地方 记下来 用的时候可以看
    转:ogre的编译及安装
    转:Ogre TerrainGroup地形赏析
    转:如何编译delta3d
  • 原文地址:https://www.cnblogs.com/liyugeng/p/7865855.html
Copyright © 2011-2022 走看看