  • Crawling Beike (贝壳找房, Lianjia's housing platform) listings with the Scrapy framework

    1. First, define the item container with the fields to scrape, in items.py.

    class CDErshouFang(scrapy.Item):
        "Beike second-hand housing listing"
        house_name = scrapy.Field()            # name of the residential complex
        house_address = scrapy.Field()         # address of the complex
        house_info = scrapy.Field()            # listing info: floor, year built, layout, floor area, orientation
        release_time = scrapy.Field()          # listing date
        house_tags = scrapy.Field()            # tags
        price = scrapy.Field()                 # unit price
        total_price = scrapy.Field()           # total price
        details = scrapy.Field()               # detail page: district
        trading_ownership = scrapy.Field()     # transaction ownership
        commodity_use = scrapy.Field()         # property usage
        house_year = scrapy.Field()            # years of ownership
        property = scrapy.Field()              # title ownership
        mortgage_information = scrapy.Field()  # mortgage information
        room_spare = scrapy.Field()            # property certificate availability

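    A scrapy.Item behaves like a dictionary: fields are read and written with item["field"] syntax, and only declared fields are accepted. A minimal sketch (run, for example, inside scrapy shell; the values are made up):

    from scrapystudy.items import CDErshouFang

    item = CDErshouFang()
    item["house_name"] = "xx小区"         # dict-style assignment to a declared field
    item["price"] = "15000元/平米"
    print(dict(item))                     # items convert cleanly to a plain dict for pipelines
    # item["floor"] = 3                   # would raise KeyError: 'floor' is not a declared Field
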
    2. Write the spider that parses the pages fetched by the downloader; it lives under the spiders directory.

    import scrapy
    from scrapystudy.items import CDErshouFang


    class CdErshoufangSpider(scrapy.Spider):
        name = 'cd_ershoufang'
        allowed_domains = ['cd.ke.com']
        start_urls = ['https://cd.ke.com/ershoufang/']

        def start_requests(self):
            "Override start_requests; the crawl starts from the URLs generated here"

            for page in range(1, 100):
                url = self.start_urls[0] + 'pg' + str(page) + '/'
                yield scrapy.Request(url=url, callback=self.parse, dont_filter=True)

        def parse(self, response):
            SET_SELECT = response.css('.info')  # NOTE: if this selector is wrong, every extraction below fails
            for cle in SET_SELECT:
                item = CDErshouFang()
                house_name = cle.css('.title a::text').extract_first()  # use the per-listing selector cle, not the whole response
                house_address = cle.css('.positionInfo a::text').extract_first()
                house_info = cle.css('.houseInfo::text').extract()[1].replace(' ', '').replace('\n', '')
                release_time = cle.css('.followInfo::text').extract()[1].replace(' ', '').replace('\n', '')
                price_total = cle.css('.priceInfo .totalPrice span::text').extract_first()
                if price_total is not None:
                    price_total = price_total + ''
                price = cle.css('.unitPrice span::text').extract_first()
                # house_tags = cle.css('.info .address .tag span::text').extract()
                item["house_name"] = house_name
                item["house_address"] = house_address
                item["house_info"] = house_info
                item["release_time"] = release_time
                item["total_price"] = price_total
                item["price"] = price
                # item["house_tags"] = house_tags
                details_page_url = cle.css('.title a::attr(href)').extract_first()  # link to the detail page
                # meta passes data to the next callback: assign a dict to Request's meta parameter
                yield scrapy.Request(url=details_page_url, callback=self.details, meta={'item': item})

        def details(self, response):
            "Extract data from the detail page"
            area = response.xpath('//span[@class="info"]/a[1]/text()').extract_first()
            details = response.xpath('//span[@class="info"]/a[last()]/text()').extract_first()
            if area is not None and details is not None:  # both parts are needed for the concatenation below
                details = area + ' ' + details
            trading_ownership = response.xpath('//div[@class="transaction"]/div[@class="content"]/ul/li[2]/text()').extract_first().strip()  # transaction ownership
            commodity_use = response.xpath('//div[@class="transaction"]/div[@class="content"]/ul/li[4]/text()').extract_first().strip()  # property usage
            house_year = response.xpath('//div[@class="transaction"]/div[@class="content"]/ul/li[5]/text()').extract_first().strip()  # years of ownership
            property = response.xpath('//div[@class="transaction"]/div[@class="content"]/ul/li[6]/text()').extract_first().strip()  # title ownership
            mortgage_information = response.xpath('//div[@class="transaction"]/div[@class="content"]/ul/li[7]/span[2]/text()').extract_first().strip()  # mortgage information
            room_spare = response.xpath('//div[@class="transaction"]/div[@class="content"]/ul/li[8]/text()').extract_first().strip()  # property certificate availability
            item = response.meta['item']  # retrieve the item built on the listing page
            item["details"] = details
            item["trading_ownership"] = trading_ownership
            item["commodity_use"] = commodity_use
            item["house_year"] = house_year
            item["property"] = property
            item["mortgage_information"] = mortgage_information
            item["room_spare"] = room_spare
            yield item

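    Note that newer Scrapy releases (1.7+) also support cb_kwargs for handing data to the next callback, which avoids going through response.meta. A minimal sketch of the same hand-off under that assumption (the spider and class names here are made up for illustration):

    import scrapy
    from scrapystudy.items import CDErshouFang

    class CbKwargsSketchSpider(scrapy.Spider):
        "Sketch: same item hand-off as above, but via cb_kwargs instead of meta"
        name = 'cd_ershoufang_cb_kwargs'   # hypothetical name, for illustration only
        start_urls = ['https://cd.ke.com/ershoufang/pg1/']

        def parse(self, response):
            for cle in response.css('.info'):
                item = CDErshouFang()
                item["house_name"] = cle.css('.title a::text').extract_first()
                url = cle.css('.title a::attr(href)').extract_first()
                yield scrapy.Request(url=url, callback=self.details,
                                     cb_kwargs={'item': item})

        def details(self, response, item):
            # item arrives as a keyword argument; no response.meta lookup needed
            item["details"] = response.xpath('//span[@class="info"]/a[last()]/text()').extract_first()
            yield item
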
    3. Process the scraped data: pipelines.py stores items in MongoDB or writes them to yaml/json files.

    # Define your item pipelines here
    #
    # Don't forget to add your pipeline to the ITEM_PIPELINES setting
    # See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html


    # useful for handling different item types with a single interface
    from itemadapter import ItemAdapter
    from scrapy.exceptions import DropItem
    import pymongo

    class TextPipeline:
        "To use a pipeline it must be registered first: add it to ITEM_PIPELINES in settings.py"

        def __init__(self):
            self.limit = 50

        def process_item(self, item, spider):
            "Truncate overly long titles"
            if item['title']:
                if len(item['title']) > self.limit:
                    item['title'] = item['title'][0:self.limit].rstrip() + '...'
                return item
            else:
                raise DropItem('Missing Text')

    class MongoPipeline(object):
        "Store items in MongoDB"

        def __init__(self, mongo_url, mongo_db):
            self.mongo_url = mongo_url
            self.mongo_db = mongo_db

        @classmethod
        def from_crawler(cls, crawler):
            "Read the global configuration from settings.py"
            return cls(
                mongo_url=crawler.settings.get('MONGO_URL'),
                mongo_db=crawler.settings.get('MONGO_DB')
            )

        def open_spider(self, spider):
            self.client = pymongo.MongoClient(self.mongo_url)
            self.db = self.client[self.mongo_db]

        def process_item(self, item, spider):
            name = item.__class__.__name__
            self.db[name].insert_one(dict(item))  # one collection per item class
            return item

        def close_spider(self, spider):
            self.client.close()

    import os
    import time
    import logging
    import yaml
    logger = logging.getLogger(__name__)

    class SaveBeikePipeline(object):
        "Save scraped data to a yaml/json file"

        def open_spider(self, spider):
            "Called when the spider opens; open or create the output file here"
            filetime = time.strftime("%Y%m%d")
            filepath = os.path.join(os.path.dirname(__file__), 'spiderSaveFile')
            if not os.path.exists(filepath): os.mkdir(filepath)
            # spider_file = os.path.join(filepath, '{}.yaml'.format(self.__class__.__name__))  # self.__class__.__name__ is the class name
            spider_file = os.path.join(filepath, '{}.yaml'.format(filetime))
            try:
                self.f = open(spider_file, mode='w', encoding='utf-8')
            except Exception as e:
                logger.error(e)

        def process_item(self, item, spider):
            "Format and write one item"
            data = dict()
            data["小区名称"] = item["house_name"]
            data["在售状态"] = item["on_sale"]
            data["房屋类型"] = item["house_type"]
            data["小区地址"] = item["address"]
            data["房屋户型"] = item["door_module"]
            data["建筑面积"] = item["area"]
            data["价格"] = item["price"]
            data["总价/套"] = item["total_price"]
            data["附近设施"] = item["tags"]
            # self.f.write(str(data) + '\n')
            spider_data = yaml.dump(data, allow_unicode=True, width=1000, sort_keys=False)  # sort_keys=False keeps the keys in their original order
            self.f.write(spider_data + '*'.center(50, '-') + '\n')
            return item

        def close_spider(self, spider):
            "Called when the spider closes; close the file"
            self.f.close()

    class SaveCDershouFangPipeline(object):
        "Save scraped data to a yaml file"

        def open_spider(self, spider):
            "Called when the spider opens; open or create the output file here"
            filetime = time.strftime("%Y%m%d")
            filepath = os.path.join(os.path.dirname(__file__), 'spiderSaveFile')
            if not os.path.exists(filepath): os.mkdir(filepath)
            # spider_file = os.path.join(filepath, '{}.yaml'.format(self.__class__.__name__))  # self.__class__.__name__ is the class name
            spider_file = os.path.join(filepath, 'cd_ershoufang{}.yaml'.format(filetime))
            try:
                self.f = open(spider_file, mode='w', encoding='utf-8')
            except Exception as e:
                logger.error(e)

        def process_item(self, item, spider):
            "Format and write one item"
            data = dict()
            data["小区名称"] = item["house_name"]
            data["小区地址"] = item["house_address"]
            data["房子信息"] = item["house_info"]
            data["发布时间"] = item["release_time"]
            data["总价/套"] = item["total_price"]
            data["均价"] = item["price"]
            # data["标签"] = item["house_tags"]
            data["所在区域"] = item["details"]
            data["交易权属"] = item["trading_ownership"]
            data["房屋用途"] = item["commodity_use"]
            data["房屋年限"] = item["house_year"]
            data["产权所属"] = item["property"]
            data["抵押信息"] = item["mortgage_information"]
            data["房本备件"] = item["room_spare"]
            spider_data = yaml.dump(data, allow_unicode=True, width=1000, sort_keys=False)  # sort_keys=False keeps the keys in their original order (default is True)
            self.f.write(spider_data + '*'.center(60, '-') + '\n')
            return item

        def close_spider(self, spider):
            "Called when the spider closes; close the file"
            self.f.close()

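    As an alternative to a hand-written file pipeline, Scrapy's built-in feed exports can dump the items directly, e.g. scrapy crawl cd_ershoufang -o cd_ershoufang.json. The same thing can be declared once in settings.py; a minimal sketch, assuming Scrapy 2.1+ (the output filename is arbitrary):

    FEEDS = {
        'cd_ershoufang.json': {'format': 'json', 'encoding': 'utf8'},
    }
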
    4. Some paginated pages cannot be fetched directly, so Scrapy is combined with Selenium: the page source Selenium renders is wrapped into a Response object and handed to the spider for parsing, in middlewares.py.

    # Define here the models for your spider middleware
    #
    # See documentation in:
    # https://docs.scrapy.org/en/latest/topics/spider-middleware.html

    from scrapy import signals

    # useful for handling different item types with a single interface
    from itemadapter import is_item, ItemAdapter


    class ScrapystudySpiderMiddleware:
        # Not all methods need to be defined. If a method is not defined,
        # scrapy acts as if the spider middleware does not modify the
        # passed objects.

        @classmethod
        def from_crawler(cls, crawler):
            # This method is used by Scrapy to create your spiders.
            s = cls()
            crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
            return s

        def process_spider_input(self, response, spider):
            # Called for each response that goes through the spider
            # middleware and into the spider.

            # Should return None or raise an exception.
            return None

        def process_spider_output(self, response, result, spider):
            # Called with the results returned from the Spider, after
            # it has processed the response.

            # Must return an iterable of Request, or item objects.
            for i in result:
                yield i

        def process_spider_exception(self, response, exception, spider):
            # Called when a spider or process_spider_input() method
            # (from other spider middleware) raises an exception.

            # Should return either None or an iterable of Request or item objects.
            pass

        def process_start_requests(self, start_requests, spider):
            # Called with the start requests of the spider, and works
            # similarly to the process_spider_output() method, except
            # that it doesn't have a response associated.

            # Must return only requests (not items).
            for r in start_requests:
                yield r

        def spider_opened(self, spider):
            spider.logger.info('Spider opened: %s' % spider.name)


    class ScrapystudyDownloaderMiddleware:
        # Not all methods need to be defined. If a method is not defined,
        # scrapy acts as if the downloader middleware does not modify the
        # passed objects.

        @classmethod
        def from_crawler(cls, crawler):
            # This method is used by Scrapy to create your spiders.
            s = cls()
            crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
            return s

        def process_request(self, request, spider):
            # Called for each request that goes through the downloader
            # middleware.
            # request.cookie = {
            #     "Cookie":"__mta=108386109.1609123577452.1610351007435.1610351353409.13; __mta=108386109.1609123577452.1610351353409.1610362706394.14; uuid_n_v=v1; _lxsdk_cuid=176a73d3e42c8-057a36937583e8-c791039-149c48-176a73d3e42c8; _lx_utm=utm_source%3DBaidu%26utm_medium%3Dorganic; uuid=DF86446053FA11EBBFA05D0E1C80A5E52BD1299115184C8C837F6324366BFFA0; _csrf=1d012800348e02304158b04bcaacdb15959e3482e6847893721b340ca6f29323; lt=8kvWp1o5sQYEgkrZTHbti6H0uI8AAAAAhgwAADxF8ufwXVyR4TU3_BGMHAKsB_TA6toYFjxg-m34Z43vNJlCb9Bv05PqTeelhSHITw; lt.sig=iPSGNXFnd3jV3SEy7wzqa0L_QOw; uid=2829236546; uid.sig=fiHM__7YgLUMEaZ05TkEQaVApbs; _lxsdk=DF86446053FA11EBBFA05D0E1C80A5E52BD1299115184C8C837F6324366BFFA0; Hm_lvt_703e94591e87be68cc8da0da7cbd0be2=1609123577,1609148969,1610350992,1610362253; __mta=108386109.1609123577452.1610362628562.1610362689900.15; Hm_lpvt_703e94591e87be68cc8da0da7cbd0be2=1610362706; _lxsdk_s=176f0edcffa-620-f33-c24%7C%7C53",
            #     "User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36"
            # }

            # Must either:
            # - return None: continue processing this request
            # - or return a Response object
            # - or return a Request object
            # - or raise IgnoreRequest: process_exception() methods of
            #   installed downloader middleware will be called
            return None

        def process_response(self, request, response, spider):
            # Called with the response returned from the downloader.

            # Must either;
            # - return a Response object
            # - return a Request object
            # - or raise IgnoreRequest
            return response

        def process_exception(self, request, exception, spider):
            # Called when a download handler or a process_request()
            # (from other downloader middleware) raises an exception.

            # Must either:
            # - return None: continue processing this exception
            # - return a Response object: stops process_exception() chain
            # - return a Request object: stops process_exception() chain
            pass

        def spider_opened(self, spider):
            spider.logger.info('Spider opened: %s' % spider.name)

    # import logging
    # class ProxyMiddleware(object):
    #     "Downloader middleware that sets a proxy for every request"
    #     logger = logging.getLogger(__name__)
    #     def process_request(self, request, spider):
    #         self.logger.debug("Using Proxy")
    #         request.meta["proxy"] = "http://125.87.105.4:49713"

    from selenium import webdriver
    from selenium.webdriver.common.by import By
    from selenium.webdriver.support.ui import WebDriverWait
    from selenium.webdriver.support import expected_conditions as EC
    from selenium.common.exceptions import TimeoutException
    from scrapy.http import HtmlResponse
    from selenium.webdriver.chrome.options import Options
    import logging
    import time

    logger = logging.getLogger(__name__)

    class SeleniumMiddleware(object):  # open question: how can several HtmlResponse objects be handed to the spider for parsing?

        def process_request(self, request, spider):
            url = request.url
            opt = Options()
            opt.add_argument('--headless')
            # create the Chrome instance; the options must be passed in, otherwise headless mode is ignored
            browser = webdriver.Chrome(options=opt)
            wait = WebDriverWait(browser, 10)
            browser.get(url)
            htmls = []
            for page in range(2, 3):
                try:
                    next_page = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, "body > div.page-container.clearfix > div.page-box > a.next")))
                    next_page.click()
                    # wait until the highlighted page number matches the page we just navigated to
                    wait.until(EC.text_to_be_present_in_element((By.CSS_SELECTOR, "body > div.page-container.clearfix > div.page-box > span.active"), str(page)))
                except TimeoutException:
                    continue
                browser.execute_script('window.scrollTo(0,document.body.scrollHeight)')
                time.sleep(2)
            html = browser.page_source  # rendered page source
            logger.info("Fetched URL: " + request.url)
            # browser.quit()
            return HtmlResponse(url=request.url, body=html, request=request, encoding='utf-8')

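    On the question left in the comment above: process_request can only return one Response per request, so a simpler pattern is to let the spider keep generating the pg{n} URLs (as start_requests already does) and have the middleware render exactly one page per request, closing the browser each time. A minimal sketch under that assumption (the class name is made up; it reuses the imports already at the top of middlewares.py):

    class SinglePageSeleniumMiddleware(object):
        "Sketch: render one URL with headless Chrome and return it as a single HtmlResponse"

        def process_request(self, request, spider):
            opt = Options()
            opt.add_argument('--headless')
            browser = webdriver.Chrome(options=opt)  # the options must actually be passed in
            try:
                browser.get(request.url)
                # wait until at least one listing card is present before grabbing the source
                WebDriverWait(browser, 10).until(
                    EC.presence_of_element_located((By.CSS_SELECTOR, '.info')))
                html = browser.page_source
            except TimeoutException:
                html = browser.page_source  # fall back to whatever has rendered so far
            finally:
                browser.quit()  # always release the Chrome process
            return HtmlResponse(url=request.url, body=html, request=request, encoding='utf-8')

    Registering a middleware like this under DOWNLOADER_MIDDLEWARES (step 5) routes every request through it, so the spider's parse/details callbacks stay unchanged.
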
    5. Activate the item pipelines and the middlewares in settings.py.

    # Scrapy settings for scrapystudy project
    #
    # For simplicity, this file contains only settings considered important or
    # commonly used. You can find more settings consulting the documentation:
    #
    #     https://docs.scrapy.org/en/latest/topics/settings.html
    #     https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
    #     https://docs.scrapy.org/en/latest/topics/spider-middleware.html

    BOT_NAME = 'scrapystudy'

    SPIDER_MODULES = ['scrapystudy.spiders']
    NEWSPIDER_MODULE = 'scrapystudy.spiders'

    MONGO_URL = "localhost"
    MONGO_DB = "mydb"


    # Crawl responsibly by identifying yourself (and your website) on the user-agent
    #USER_AGENT = 'scrapystudy (+http://www.yourdomain.com)'

    # Obey robots.txt rules
    ROBOTSTXT_OBEY = False

    # Configure maximum concurrent requests performed by Scrapy (default: 16)
    #CONCURRENT_REQUESTS = 32

    # Configure a delay for requests for the same website (default: 0)
    # See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
    # See also autothrottle settings and docs
    #DOWNLOAD_DELAY = 3
    # The download delay setting will honor only one of:
    #CONCURRENT_REQUESTS_PER_DOMAIN = 16
    #CONCURRENT_REQUESTS_PER_IP = 16

    # Disable cookies (enabled by default)
    # COOKIES_ENABLED = True

    # Disable Telnet Console (enabled by default)
    #TELNETCONSOLE_ENABLED = False

    # Override the default request headers:
    # DEFAULT_REQUEST_HEADERS = {
    #   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    #   'Accept-Language': 'en',
    #   'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.141 Safari/537.36'
    # }

    # Enable or disable spider middlewares
    # See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
    # SPIDER_MIDDLEWARES = {
    #    'scrapystudy.middlewares.MyFirstSpiderMiddleware': 543,
    # }

    # Enable or disable downloader middlewares
    # See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
    # DOWNLOADER_MIDDLEWARES = {
    #    'scrapystudy.middlewares.SeleniumMiddleware': 543,
    # }

    # Enable or disable extensions
    # See https://docs.scrapy.org/en/latest/topics/extensions.html
    #EXTENSIONS = {
    #    'scrapy.extensions.telnet.TelnetConsole': None,
    #}

    # Configure item pipelines
    # See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
    # The number (300, 400, ...) is the execution order: lower values run first
    ITEM_PIPELINES = {
        'scrapystudy.pipelines.SaveCDershouFangPipeline': 600,
        # 'scrapystudy.pipelines.TextPipeline': 300,
        # 'scrapystudy.pipelines.MongoPipeline': 400,
        # 'scrapystudy.pipelines.SaveBeikePipeline': 500,
    }

    # Enable and configure the AutoThrottle extension (disabled by default)
    # See https://docs.scrapy.org/en/latest/topics/autothrottle.html
    #AUTOTHROTTLE_ENABLED = True
    # The initial download delay
    #AUTOTHROTTLE_START_DELAY = 5
    # The maximum download delay to be set in case of high latencies
    #AUTOTHROTTLE_MAX_DELAY = 60
    # The average number of requests Scrapy should be sending in parallel to
    # each remote server
    #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
    # Enable showing throttling stats for every response received:
    #AUTOTHROTTLE_DEBUG = False

    # Enable and configure HTTP caching (disabled by default)
    # See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
    #HTTPCACHE_ENABLED = True
    #HTTPCACHE_EXPIRATION_SECS = 0
    #HTTPCACHE_DIR = 'httpcache'
    #HTTPCACHE_IGNORE_HTTP_CODES = []
    #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
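
    With SaveCDershouFangPipeline registered (and the DOWNLOADER_MIDDLEWARES block above uncommented if the Selenium middleware should be used), the crawl is started from the project root:

    scrapy crawl cd_ershoufang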