  • Basic Scrapy spider that crawls multiple pages (a CSV pipeline sketch follows the spider code below)

    # -*- coding: utf-8 -*-
    import scrapy


    class GjSpider(scrapy.Spider):
        """Crawl rental listings (zufang) on sz.ganji.com, page by page."""
        name = 'gj'
        allowed_domains = ['ganji.com']
        start_urls = ['http://sz.ganji.com/zufang/']

        def parse(self, response):
            # Each listing is a div under the main list container; the first
            # two child divs are not listings, so skip them.
            house_list = response.xpath('.//div[@class="f-main-list"]/div/div[position()>2]')
            for house in house_list:
                title = house.xpath(".//dl/dd[contains(@class,'title')]/a/@title").extract_first()
                size = house.xpath(".//dl/dd[contains(@class,'size')]/span[3]/text()").extract_first()
                chaoxiang = house.xpath(".//dl/dd[contains(@class,'size')]/span[5]/text()").extract_first()
                price = house.xpath(".//dl/dd[contains(@class,'info')]/div/span[1]/text()").extract_first()
                address1 = house.xpath(".//dl/dd[contains(@class,'address')]/span/a[1]/text()").extract_first()
                address2 = house.xpath(".//dl/dd[contains(@class,'address')]/span/a[2]/span/text()").extract_first()

                yield {
                    'title': title,
                    'size': size,
                    'chaoxiang': chaoxiang,  # orientation of the flat
                    'price': price,
                    'address': str(address1) + '-' + str(address2),
                }

            # Follow the "next page" link, if there is one, reusing the same callback.
            next_links = response.xpath('.//div[@class="pageBox"]//a[contains(@class,"next")]/@href').extract()
            if next_links:
                # urljoin handles the case where the href is relative.
                yield scrapy.Request(response.urljoin(next_links[0]), callback=self.parse)
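
    The spider only yields plain dicts and does not persist them anywhere. Below is a minimal item-pipeline sketch that writes each item to a CSV file. The class name CsvWriterPipeline, the output filename zufang.csv, and the ITEM_PIPELINES module path are illustrative assumptions, not part of the original project.

    # pipelines.py: a minimal sketch, assuming the item fields yielded by GjSpider.
    # Enable it in settings.py with something like:
    #   ITEM_PIPELINES = {"myproject.pipelines.CsvWriterPipeline": 300}
    import csv


    class CsvWriterPipeline:
        def open_spider(self, spider):
            # Open the output file once when the spider starts.
            self.file = open('zufang.csv', 'w', newline='', encoding='utf-8')
            self.writer = csv.DictWriter(
                self.file,
                fieldnames=['title', 'size', 'chaoxiang', 'price', 'address'],
            )
            self.writer.writeheader()

        def process_item(self, item, spider):
            # Every dict yielded by the spider becomes one CSV row.
            self.writer.writerow(item)
            return item

        def close_spider(self, spider):
            self.file.close()

    Alternatively, Scrapy's built-in feed export writes the same data without any pipeline: scrapy crawl gj -o zufang.csv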
    

      
