  • Basic Scrapy spider: crawling multiple pages

    # -*- coding: utf-8 -*-
    import scrapy


    class GjSpider(scrapy.Spider):
        name = 'gj'
        allowed_domains = ['ganji.com']
        start_urls = ['http://sz.ganji.com/zufang/']

        def parse(self, response):
            # Listing entries live under the f-main-list container;
            # position()>2 skips the leading non-listing divs.
            house_list = response.xpath('.//div[@class="f-main-list"]/div/div[position()>2]')
            for house in house_list:
                title = house.xpath(".//dl/dd[contains(@class,'title')]/a/@title").extract_first()
                size = house.xpath(".//dl/dd[contains(@class,'size')]/span[3]/text()").extract_first()
                # chaoxiang: the flat's orientation (朝向)
                chaoxiang = house.xpath(".//dl/dd[contains(@class,'size')]/span[5]/text()").extract_first()
                price = house.xpath(".//dl/dd[contains(@class,'info')]/div/span[1]/text()").extract_first()
                address1 = house.xpath(".//dl/dd[contains(@class,'address')]/span/a[1]/text()").extract_first()
                address2 = house.xpath(".//dl/dd[contains(@class,'address')]/span/a[2]/span/text()").extract_first()

                yield {
                    'title': title,
                    'size': size,
                    'chaoxiang': chaoxiang,
                    'price': price,
                    'address': str(address1) + '-' + str(address2),
                }

            # Follow the "next page" link, if any, with this same callback
            # so every page of the listing is collected.
            next_links = response.xpath('.//div[@class="pageBox"]//a[contains(@class,"next")]/@href').extract()
            if next_links:
                # urljoin makes the href absolute in case the site serves a relative link
                yield scrapy.Request(response.urljoin(next_links[0]), callback=self.parse)
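
  To persist the scraped items beyond the console, a Scrapy item pipeline can write each yielded dict to a CSV row. The sketch below is only an illustration under assumptions: the file name zufang.csv, the class name CsvWriterPipeline, and the settings key are placeholders, not part of the original project.

    # pipelines.py (hypothetical file): write each scraped item as a CSV row
    import csv


    class CsvWriterPipeline:
        def open_spider(self, spider):
            # Open the output file once per crawl; 'zufang.csv' is an example name
            self.file = open('zufang.csv', 'w', newline='', encoding='utf-8')
            self.writer = csv.DictWriter(
                self.file,
                fieldnames=['title', 'size', 'chaoxiang', 'price', 'address'])
            self.writer.writeheader()

        def process_item(self, item, spider):
            # Every dict the spider yields passes through here
            self.writer.writerow(item)
            return item

        def close_spider(self, spider):
            self.file.close()

  To enable it, register the class in settings.py, e.g. ITEM_PIPELINES = {'myproject.pipelines.CsvWriterPipeline': 300} (the project name is a placeholder). Alternatively, Scrapy's built-in feed export achieves the same with no extra code: scrapy crawl gj -o zufang.csv.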
    

      

  • Original article: https://www.cnblogs.com/brady-wang/p/12505324.html