zoukankan      html  css  js  c++  java
  • scrapy基本爬虫,采集多页

    # -*- coding: utf-8 -*-
    import csv
    
    import scrapy
    
    
    class GjSpider(scrapy.Spider):
        """Crawl rental-housing listings from sz.ganji.com.

        Yields one dict per listing with keys: title, size, chaoxiang
        (orientation), price, address. Follows the "next page" link until
        no more pages are found.
        """
        name = 'gj'
        allowed_domains = ['ganji.com']
        start_urls = ['http://sz.ganji.com/zufang/']

        def parse(self, response):
            """Extract listing items from one result page, then schedule
            the next page (if any) back through this same callback.
            """
            # Listing rows live under the f-main-list container; the first two
            # child divs are non-listing chrome, hence position()>2.
            house_list = response.xpath('.//div[@class="f-main-list"]/div/div[position()>2]')
            for house in house_list:
                title = house.xpath(".//dl/dd[contains(@class,'title')]/a/@title").extract_first()
                size = house.xpath(".//dl/dd[contains(@class,'size')]/span[3]/text()").extract_first()
                chaoxiang = house.xpath(".//dl/dd[contains(@class,'size')]/span[5]/text()").extract_first()
                price = house.xpath(".//dl/dd[contains(@class,'info')]/div/span[1]/text()").extract_first()
                address1 = house.xpath(".//dl/dd[contains(@class,'address')]/span/a[1]/text()").extract_first()
                address2 = house.xpath(".//dl/dd[contains(@class,'address')]/span/a[2]/span/text()").extract_first()

                yield {
                    'title': title,
                    "size": size,
                    "chaoxiang": chaoxiang,
                    "price": price,
                    # NOTE: missing address parts render as the string "None"
                    # (original behavior deliberately preserved).
                    "address": str(address1) + "-" + str(address2),
                }

            next_links = response.xpath('.//div[@class="pageBox"]//a[contains(@class,"next")]/@href').extract()
            if next_links:
                # Bug fix: resolve against the current page URL — the original
                # passed the raw href, which breaks when the link is relative.
                next_url = response.urljoin(next_links[0])
                self.logger.debug("Following next page: %s", next_url)
                yield scrapy.Request(next_url, callback=self.parse)
    

      

  • 相关阅读:
    求助
    debian虚拟机试用
    算是业界新闻吧
    推荐一个图书分享网站
    日志
    写汇编报告小记
    水仙花数
    进程创建
    win32程序运行原理1
    [解题报告]374 Big Mod
  • 原文地址:https://www.cnblogs.com/brady-wang/p/12505324.html
Copyright © 2011-2022 走看看