1. How Scrapy Works
1.1 Components
Scheduler:
The scheduler is essentially a priority queue holding the URLs waiting to be crawled; it also integrates request filtering and URL deduplication.
Engine (Scrapy Engine):
The core of the whole Scrapy framework, responsible for coordination and communication between all the other components.
Downloader:
Fetches and downloads web page content and returns the pages to the spider (built on top of Twisted).
Pipeline:
Processes the entities (items) the spider extracts from pages; its main jobs are persisting items, validating them, and dropping unwanted data. Once a page has been parsed by the spider, the resulting items are sent to the item pipeline and pass through several processing steps in a fixed order.
Spider:
Extracts the required data from web pages and structures it into entities (items).
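To make the division of labor concrete, here is a minimal, hypothetical spider (it targets quotes.toscrape.com, Scrapy's demo site): the engine drives the whole run, the scheduler queues the start URL, the downloader fetches the page, parse() plays the spider's role, and each yielded dict is handed to the item pipeline.

import scrapy

class MinimalSpider(scrapy.Spider):
    name = 'minimal'
    start_urls = ['https://quotes.toscrape.com/']

    def parse(self, response):
        # The downloader has already fetched the page; the spider only extracts data
        for quote in response.css('div.quote'):
            # Each yielded dict travels from the engine to the item pipeline
            yield {
                'text': quote.css('span.text::text').get(),
                'author': quote.css('small.author::text').get(),
            }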
1.2 Middleware
Downloader Middleware:
The most important of Scrapy's many middlewares. It sits between the engine and the downloader, and several downloader middlewares can be chained and run in sequence. While the engine is passing a download request along, before the HTTP request is sent (process_request), a downloader middleware can modify the request, for example to set a dynamic proxy IP, change the User-Agent, or add or override header fields. After the HTTP request, while the response is being passed back to the engine (process_response), a downloader middleware can post-process the response, for example decompressing gzip content (a minimal sketch follows at the end of this subsection).
Spider Middleware:
A component that sits between the engine and the spiders. Custom logic can be hooked in here to process the responses sent to a Spider, as well as the items and requests the Spider produces.
For a usage reference, see https://www.jianshu.com/p/4d8862522fa7
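To make the downloader-middleware hooks concrete, here is a minimal sketch; the class name, User-Agent strings, and proxy address are invented for illustration, and the class would have to be enabled through DOWNLOADER_MIDDLEWARES in settings.py.

import random

class RandomUserAgentMiddleware(object):
    USER_AGENTS = [
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64)',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7)',
    ]

    def process_request(self, request, spider):
        # Runs before the HTTP request is sent to the downloader
        request.headers['User-Agent'] = random.choice(self.USER_AGENTS)
        # request.meta['proxy'] = 'http://127.0.0.1:8888'  # a proxy could be set here
        return None  # None: continue with the next middleware / the downloader

    def process_response(self, request, response, spider):
        # Runs while the response travels back to the engine
        return response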
1.3 Data Flow
The data flow in Scrapy is controlled by the execution engine and proceeds as follows:
- The engine obtains the initial requests to crawl from the Spider.
- The engine schedules those requests in the Scheduler and asks the Scheduler for the next request to crawl.
- The Scheduler returns the next request to the engine.
- The engine sends the request to the Downloader through the Downloader Middlewares; at this point each middleware's process_request() function is called.
- Once the page has finished downloading, the Downloader generates a Response for it, passes it back through the Downloader Middlewares' process_response() function, and finally returns it to the engine.
- The engine receives the Response from the Downloader and sends it to the Spider for processing through the Spider Middlewares; here process_spider_input() is called (see the sketch after this list).
- The Spider processes the Response and returns the scraped items and any (follow-up) new requests to the engine through the Spider Middlewares; here process_spider_output() is called.
- The engine passes the scraped items to the Item Pipeline, sends the returned requests to the Scheduler, and asks the Scheduler for the next request to crawl.
- The process repeats (from the second step) until there are no more requests in the Scheduler.
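The process_spider_input()/process_spider_output() hooks referenced in the steps above look roughly like this; the logging middleware below is hypothetical and would be enabled through SPIDER_MIDDLEWARES in settings.py.

class LoggingSpiderMiddleware(object):
    def process_spider_input(self, response, spider):
        # Called for each response before it reaches the spider callback
        spider.logger.debug('Response in: %s', response.url)
        return None  # None means: pass the response on unchanged

    def process_spider_output(self, response, result, spider):
        # Called with the items and requests the spider callback yielded
        for element in result:
            yield element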
2. Crawling All New-Home and Second-Hand Listings from Fang.com (房天下)
2.1 Project layout
fangtianxia
│  items.py
│  middlewares.py
│  pipelines.py
│  settings.py
│  utils.py
│  __init__.py
│
├─spiders
│  │  ftx.py
│  │  __init__.py
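settings.py appears in the tree but is not reproduced below; at a minimum it has to register the item pipeline for the scraped items to be persisted. A sketch under that assumption (the pipeline class name and the delay value are guesses):

BOT_NAME = 'fangtianxia'
SPIDER_MODULES = ['fangtianxia.spiders']
NEWSPIDER_MODULE = 'fangtianxia.spiders'

# Be polite to the site; the exact delay is a matter of taste
DOWNLOAD_DELAY = 1

# Route scraped items through the pipeline (class name assumed, see pipelines.py)
ITEM_PIPELINES = {
    'fangtianxia.pipelines.FangtianxiaPipeline': 300,
}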
2.2 Core code
ftx.py --- the spider; parses the new-home and second-hand listing pages
# -*- coding: utf-8 -*-
import re

import scrapy

from ..items import NewHouseItem, SecondHandHouseItem
from ..utils import GenCityData


class FtxSpider(scrapy.Spider):
    name = 'ftx'
    # The listing pages all live under fang.com (matching start_urls), not fangtianxia.com
    allowed_domains = ['fang.com']
    start_urls = ['https://www.fang.com/SoufunFamily.htm']

    def parse(self, response):
        # The rows of the city table carry ids sffamily_B03_01, sffamily_B03_02, ...
        id_no = 1
        id_prefix = "sffamily_B03_{0}"
        while True:
            cur_no = str(id_no) if id_no >= 10 else '0' + str(id_no)
            cur_basic_xpath = "//tr[@id='" + id_prefix.format(cur_no) + "']"
            res = response.xpath(cur_basic_xpath)
            if not res:
                # No such row: we have walked past the last region
                break
            g = GenCityData(res)
            for region_name, city_name, newhouse_link, oldhouse_link in g.data():
                yield scrapy.Request(
                    url=newhouse_link,
                    callback=self.parse_newhouse,
                    meta={'info': (region_name, city_name)},
                    dont_filter=True,
                )
                yield scrapy.Request(
                    url=oldhouse_link,
                    callback=self.parse_oldhouse,
                    meta={'info': (region_name, city_name)},
                    dont_filter=True,
                )
            id_no += 1
    def parse_newhouse(self, response):
        region_name, city_name = response.meta.get('info')
        house_items = response.xpath("//li//div[contains(@class, 'nlc_details')]")

        def format_func(regex, parts, join_tag):
            # Join the extracted text fragments, then strip whatever the regex matches
            return re.sub(regex, '', join_tag.join(parts))

        for house in house_items:
            # Development (complex) name
            unformate_name = house.xpath(".//div[contains(@class, 'nlcd_name')]/a/text()").getall()
            house_name = format_func(r'\s', unformate_name, '')
            # Layout (number of rooms)
            house_type = '|'.join(house.xpath("./div[contains(@class, 'house_type')]/a/text()").getall())
            # Floor area
            unformate_area = house.xpath("./div[contains(@class, 'house_type')]/text()").getall()
            area = format_func(r'\s|/|-', unformate_area, '')
            # Address
            unformate_addr = house.xpath(".//div[contains(@class, 'address')]//text()").getall()
            address = format_func(r'\s', unformate_addr, '')
            # Price (also strip the '广告' ad label)
            unformate_price = house.xpath("./div[@class='nhouse_price']//text()").getall()
            price = format_func(r'\s|广告', unformate_price, '')
            # Contact phone number
            unformate_tel = house.xpath(".//div[@class='tel']/p/text()").getall()
            mobile = unformate_tel[0] if unformate_tel else ""
            # Detail page (the href on the page is protocol-relative)
            detail_link = 'https:' + (house.xpath(".//div[contains(@class, 'nlcd_name')]/a/@href").get() or '')
            # Status: on sale or upcoming
            status = house.xpath(".//span[@class='inSale']/text()").get()
            # Tags
            tags = format_func(r'\s', house.xpath(".//div[contains(@class,'fangyuan')]/a/text()").getall(), '|')
            yield NewHouseItem(
                house_name=house_name,
                house_type=house_type,
                area=area,
                address=address,
                detail_link=detail_link,
                price=price,
                mobile=mobile,
                status=status,
                tags=tags,
                region_name=region_name,
                city_name=city_name,
            )
        next_page = response.xpath("//div[@class='page']//a[@class='next']/@href").get()
        if next_page:
            yield scrapy.Request(
                url=response.urljoin(next_page),
                callback=self.parse_newhouse,
                meta={'info': (region_name, city_name)},
                dont_filter=True,
            )
    def parse_oldhouse(self, response):
        region_name, city_name = response.meta.get('info')
        house_items = response.xpath("//div[contains(@class,'shop_list')]//dl[@id]")
        for house in house_items:
            # Complex name
            house_name = house.xpath(".//p[@class='add_shop']/a/@title").get()
            # Listing title
            title = house.xpath("./dd//span[@class='tit_shop']/text()").get()
            detail_list = house.xpath(".//p[contains(@class,'tel_shop')]/text()").getall()
            detail_list = [x.strip() for x in detail_list]
            # Layout, floor area, floor, orientation, year built
            house_type, area, floor, direction, *_ = detail_list
            # Seller name
            house_master = house.xpath(".//span[contains(@class,'people_name')]/a/text()").get()
            # Total price
            total_price = house.xpath("./dd[@class='price_right']/span/b/text()").get()
            # Unit price (the last text node of the price block)
            unit_price = house.xpath("./dd[@class='price_right']/span//text()").getall()[-1]
            # Address
            address = house.xpath(".//p[@class='add_shop']/span/text()").get()
            yield SecondHandHouseItem(
                title=title,
                house_type=house_type,
                area=area,
                floor=floor,
                direction=direction,
                house_master=house_master,
                detail_addr=address,
                total_price=total_price,
                unit_price=unit_price,
                region_name=region_name,
                city_name=city_name,
                house_name=house_name,
            )
        # Pagination: follow the '下一页' (next page) link if present
        next_sel = response.xpath("//div[@class='page_al']//p/a[text()='下一页']")
        if next_sel:
            next_url = next_sel.xpath("./@href").get()
            yield scrapy.Request(
                url=response.urljoin(next_url),
                callback=self.parse_oldhouse,
                dont_filter=True,
                meta={'info': (region_name, city_name)},
            )
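The spider can be started with "scrapy crawl ftx" from the project root, or programmatically; a minimal runner sketch:

from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

process = CrawlerProcess(get_project_settings())
process.crawl('ftx')  # the spider's name attribute
process.start()       # blocks until the crawl finishes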
utils.py ---- generates each region's new-home and second-hand listing URLs
"""
该模块主要提供工具类
"""
import threading
Lock = threading.Lock()
class GenCityData(object):
"""提取首页的城市连接"""
def __new__(cls, *args, **kwargs):
with Lock:
if hasattr(cls, '_instance'):
return cls._instance
setattr(cls, '_instance', object.__new__(cls))
return cls._instance
def __init__(self, res):
self.res = res
def _is_valid(self):
"""特别行政区的id与部分省份相同,处理差错"""
# 排除 特殊空格字符
region_name_list = list(
filter(lambda x: len(x.get().strip()), self.res.xpath(".//strong/text()"))
)
return True if len(region_name_list) == 2 else False
def _region_format(self):
if self._is_valid():
*region_eles, special_region = self.res
yield region_eles
yield [special_region,]
else:
yield self.res
def data(self):
"""数据结果集生成器"""
region_name = None
for idx, selector_eles in enumerate(self._region_format()):
if idx == 0:
region_name = selector_eles[0].xpath('.//strong/text()').get()
# print(region_name)
cities = list()
for selector in selector_eles:
for city_name, city_link in zip(selector.xpath('.//a/text()'),selector.xpath('.//a/@href')):
cities.append((city_name.get(), city_link.get()))
for ins in cities:
# print(region_name, ins)
# 新房地址
temp1 = ins[-1].split('.')
temp1.insert(1, 'newhouse')
newhouse_link_prefix = '.'.join(temp1)
newhouse_link = newhouse_link_prefix + 'house/s/'
# 二手房地址
temp1[1] = 'esf'
oldhouse_link = '.'.join(temp1)
# print(region_name, ins[0], newhouse_link, oldhouse_link)
yield region_name, ins[0], newhouse_link, oldhouse_link
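The URL rewriting at the end of data() is easiest to follow on a concrete city link (the Beijing URL below is illustrative):

link = 'http://bj.fang.com/'
parts = link.split('.')       # ['http://bj', 'fang', 'com/']
parts.insert(1, 'newhouse')   # ['http://bj', 'newhouse', 'fang', 'com/']
newhouse_link = '.'.join(parts) + 'house/s/'  # http://bj.newhouse.fang.com/house/s/
parts[1] = 'esf'
oldhouse_link = '.'.join(parts)               # http://bj.esf.fang.com/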
items.py ---- the data entities (items)
import scrapy


class NormalDataItem(scrapy.Item):
    # Complex (development) name
    house_name = scrapy.Field()
    # Floor area
    area = scrapy.Field()
    # Region (province)
    region_name = scrapy.Field()
    # City
    city_name = scrapy.Field()


class NewHouseItem(NormalDataItem):
    # Address
    address = scrapy.Field()
    # Layout (number of rooms)
    house_type = scrapy.Field()
    # Detail page link
    detail_link = scrapy.Field()
    # Price
    price = scrapy.Field()
    # Contact phone number
    mobile = scrapy.Field()
    # Status
    status = scrapy.Field()
    # Tags
    tags = scrapy.Field()


class SecondHandHouseItem(NormalDataItem):
    # Listing title
    title = scrapy.Field()
    # Floor
    floor = scrapy.Field()
    # Orientation
    direction = scrapy.Field()
    # Seller
    house_master = scrapy.Field()
    # Address
    detail_addr = scrapy.Field()
    # Total price
    total_price = scrapy.Field()
    # Price per square metre
    unit_price = scrapy.Field()
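pipelines.py is listed in the project tree but not shown; here is a minimal sketch of a pipeline that persists every item as one JSON line (the class name and output file are assumptions, and the class must be registered in ITEM_PIPELINES):

import json

class FangtianxiaPipeline(object):
    def open_spider(self, spider):
        self.fp = open('fangtianxia.jsonl', 'a', encoding='utf-8')

    def process_item(self, item, spider):
        # Works for both NewHouseItem and SecondHandHouseItem
        self.fp.write(json.dumps(dict(item), ensure_ascii=False) + '\n')
        return item  # hand the item on to any later pipeline

    def close_spider(self, spider):
        self.fp.close()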