zoukankan      html  css  js  c++  java
  • 分布式爬虫

    1. 爬取首页

     # -*- coding: utf-8 -*-
     from scrapy_redis.spiders import RedisCrawlSpider
     import scrapy
     import redis


     class DemoSpider(RedisCrawlSpider):
         """Seed spider: read the home page and queue list-page URLs in Redis.

         Start URLs are taken from the Redis key ``demo:start_urls``. Every
         link found in the ``search_right_demo`` panel is rewritten and pushed
         onto the ``demo:list_urls`` Redis list.
         """

         name = 'demo'
         allowed_domains = ['demo.com']
         redis_key = 'demo:start_urls'
         # Redis client used to hand URLs over to the next stage.
         redis_info = redis.Redis(host='111.111.111.111', port=6379)

         @staticmethod
         def _pin_first_param(url, value='489'):
             """Return *url* with the value after its first ``=`` replaced.

             The text between the first ``=`` and the following ``&`` is
             replaced by *value*. A URL without ``=`` is returned unchanged,
             and a URL without a following ``&`` has everything after ``=``
             replaced, instead of being mangled by a ``-1`` index from
             ``str.find``.
             """
             eq_pos = url.find('=')
             if eq_pos == -1:
                 return url
             amp_pos = url.find('&', eq_pos + 1)
             if amp_pos == -1:
                 return url[:eq_pos + 1] + value
             return url[:eq_pos + 1] + value + url[amp_pos:]

         def parse(self, response):
             """Push one rewritten list-page URL to Redis per panel link.

             Nothing is yielded; the only output is the ``lpush`` calls on
             ``demo:list_urls``.
             """
             # ``host_urls`` is not defined on this class. Honour it when it
             # has been set from outside (e.g. as a spider argument), and
             # otherwise resolve links against the response URL.
             host_prefix = getattr(self, 'host_urls', None)
             # Selecting @href directly skips anchors that have no href, so
             # there is no IndexError from indexing an empty result list.
             hrefs = response.xpath(
                 '//div[@id="search_right_demo"]/div'
                 '/div[@class="clearfixed"]/a/@href'
             ).extract()
             for href in hrefs:
                 if host_prefix:
                     url = host_prefix + href
                 else:
                     url = response.urljoin(href)
                 # NOTE(review): '489' is presumably a site-specific id for
                 # the first query parameter -- confirm its meaning.
                 url = self._pin_first_param(url, '489')
                 self.redis_info.lpush('demo:list_urls', url)

    2. 爬取URL列表

    # -*- coding: utf-8 -*-
    from scrapy_redis.spiders import RedisCrawlSpider
    import scrapy
    import redis


    class DemoListSpider(RedisCrawlSpider):
        """List-page spider: queue detail-page URLs and follow pagination.

        URLs are consumed from the Redis list ``demo:list_urls``. Detail links
        found in the result table are pushed onto ``demo:info_urls``, and the
        "next page" link, when present, is pushed back onto
        ``demo:list_urls``.
        """

        name = 'demo_list'
        allowed_domains = ['demo.com']
        redis_key = 'demo:list_urls'
        # Redis client used to hand URLs over to the other stages.
        redis_info = redis.Redis(host='111.111.111.111', port=6379)

        def parse(self, response):
            """Push detail URLs and the next list page to Redis.

            Nothing is yielded; the only output is the ``lpush`` calls.
            """
            selector = scrapy.Selector(response)

            # One href per result table (first row, first cell).
            table_a_xpath = selector.xpath(
                '//*[@id="newlist_list_content_table"]/table/tr[1]/td[1]/div/a/@href'
            ).extract()
            for url in table_a_xpath:
                self.redis_info.lpush('demo:info_urls', url)

            # Re-queue the next list page so that pagination continues.
            # NOTE(review): the href is pushed exactly as extracted; if the
            # site emits relative links it would need resolving -- confirm.
            next_page = selector.xpath('//a[@class="next-page"]/@href').extract()
            if next_page:
                self.redis_info.lpush('demo:list_urls', next_page[0])

    3. 爬取商品详细信息

    # -*- coding: utf-8 -*-
    from scrapy_redis.spiders import RedisCrawlSpider
    from demo.items import demoItem
    import scrapy
    import redis
    class demoInfoSpider(RedisCrawlSpider):
        """Detail-page spider: extract header fields and yield one item.

        URLs are consumed from the Redis list ``demo:info_urls``.
        """

        name = 'demo_info'
        # NOTE(review): the other spiders in this article use 'demo.com';
        # confirm which domain is intended here.
        allowed_domains = ['zhaopin.com']
        redis_key = 'demo:info_urls'
        redis_info = redis.Redis(host='111.111.111.111', port=6379)

        def parse(self, response):
            """Yield a ``demoItem`` built from the fixed header box.

            Each field holds the list of text nodes returned by ``extract()``,
            which may be empty when the page layout differs.
            """
            sel = scrapy.Selector(response)
            # All three values live under the same header container.
            header_box = (
                '//div[@class="top-fixed-box"]'
                '/div[@class="fixed-inner-box"]/div[1]'
            )
            zwmc = sel.xpath(header_box + '/h1/text()').extract()
            gsmc = sel.xpath(header_box + '/h2/a/text()').extract()
            flxx = sel.xpath(header_box + '/div/span/text()').extract()

            # The original snippet yielded ``item`` without ever creating it.
            # NOTE(review): assumes demoItem declares the fields 'zwmc',
            # 'gsmc' and 'flxx' -- confirm against demo/items.py.
            item = demoItem()
            item['zwmc'] = zwmc
            item['gsmc'] = gsmc
            item['flxx'] = flxx
            yield item

  • 相关阅读:
    字符串替换
    字符串查找
    字符串比较
    字节与字符串相互转换
    1365. How Many Numbers Are Smaller Than the Current Number
    1486. XOR Operation in an Array
    1431. Kids With the Greatest Number of Candies
    1470. Shuffle the Array
    1480. Running Sum of 1d Array
    【STM32H7教程】第56章 STM32H7的DMA2D应用之刷色块,位图和Alpha混合
  • 原文地址:https://www.cnblogs.com/liyugeng/p/7865855.html
Copyright © 2011-2022 走看看