Using Scrapy: build the listing URLs, find the pagination parameter, and save the results as JSON.
To crawl truly all of the data you would need to enumerate combinations of the filter parameters; this version only covers the site's default cap of 100 result pages per city (a sketch of the filter-combination idea follows below).
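The filter-combination idea can be sketched as follows. The segment codes used here (p1..p8 for price bands, a1..a6 for room counts) are placeholders assumed for illustration, not confirmed Lianjia codes; read the real ones off the site's own filter URLs. Only co32 (a sort order) appears in the spider itself.

from itertools import product

# Hypothetical filter segments -- replace with the codes the site actually uses
price_bands = ['p{}'.format(i) for i in range(1, 9)]
room_counts = ['a{}'.format(i) for i in range(1, 7)]

def filtered_start_urls(city_base):
    # One start URL per (price, rooms) slice; each slice should stay under the 100-page cap
    for price, rooms in product(price_bands, room_counts):
        yield '{}ershoufang/pg1{}{}co32/'.format(city_base, price, rooms)

# e.g. next(filtered_start_urls('https://bj.lianjia.com/'))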
lj.py
# -*- coding: utf-8 -*-
import copy
import re
import time

import scrapy

from lianjia.items import LianjiaItem


class LjSpider(scrapy.Spider):
    name = 'lj'
    allowed_domains = ['lianjia.com']
    start_urls = ['https://www.lianjia.com/city/']

    cookies_str = """lianjia_uuid=47a174e2-625a-4b22-a3c2-25fd1ec31b81; _ga=GA1.2.1898977159.1552302637; _gid=GA1.2.96347100.1552302637; lianjia_ssid=95f26562-a063-4c30-a718-28df4f75bc9c; _smt_uid=5c864d54.1796444c; Hm_lvt_9152f8221cb6243a53c83b956842be8a=1552305492; gr_user_id=f98cfb4a-0fd7-48b6-bd92-84b72f7cda35; gr_session_id_a1a50f141657a94e=50c2d270-7331-4957-9f3c-ea51bf6dc831; gr_session_id_a1a50f141657a94e_50c2d270-7331-4957-9f3c-ea51bf6dc831=true; Hm_lpvt_9152f8221cb6243a53c83b956842be8a=1552307519; select_city=341100; lj_newh_session=eyJpdiI6IjRmZGdtYjR6Q3FEK2RoVVVBbGRib3c9PSIsInZhbHVlIjoicGR4N3hyZVwvRHN2dEFaR3pBY1Jodm1QVkZ2QVFuSTdia0RcL1wvVUpCU2JjZnpxTWRNWE9JTWxWOG1OZUZWMU52bXdZQ1wveGk0cUViK1hrZFVYblwvVlpiQT09IiwibWFjIjoiN2ZhOGU5N2Y0YWVmOGIyYjRmM2I4YTdmNzQzNDMxMzk5N2ZlYjQzNmU1MzI3OTQ0YTM3YjE4NDhlMDRkZTM2NyJ9"""
    # Split on the first '=' only: cookie values (e.g. the base64 session above) can themselves contain '='
    cookies_dict = {i.split('=', 1)[0]: i.split('=', 1)[1] for i in cookies_str.split('; ')}
    def parse(self, response):
        # One <a> per city on the national city index page
        city_list = response.xpath('//div/ul/li/div/div/ul/li/a')
        for data in city_list:
            time.sleep(1)  # crude throttle; DOWNLOAD_DELAY in settings.py is the more idiomatic option
            city_dict = LianjiaItem()
            city_dict['name'] = data.xpath('./text()').extract_first()
            # pg1 = first listing page, co32 = sort order used throughout this spider
            city_dict['link'] = data.xpath('./@href').extract_first() + 'ershoufang/pg1co32/'
            # deepcopy so concurrent callbacks do not all mutate the same item
            yield scrapy.Request(city_dict['link'], encoding='utf-8', cookies=self.cookies_dict,
                                 callback=self.parse_city, meta={'city_h': copy.deepcopy(city_dict)})
    def parse_city(self, response):
        city_dict = response.meta['city_h']
        hourse_list = response.xpath('//div[1]/ul/li/div[1]')
        for hourse in hourse_list:
            hoursing_area = hourse.xpath('./div[3]/div/a/text()').extract_first()
            if hoursing_area:
                city_dict['hoursing_area'] = hoursing_area
                city_dict['hoursing_total_price'] = hourse.xpath('./div[6]/div[1]/span/text()').extract_first()
                city_dict['hoursing_unit_price'] = hourse.xpath('./div[6]/div[2]/span/text()').extract_first()
                yield city_dict
        city_url = response.xpath('//body/div[1]/div/ul/li[2]/a/@href').extract_first()
        # totalPage is embedded in the page's inline JS; the \d must be escaped or the pattern never matches
        page_total = re.findall(r'totalPage":(\d{1,3})', response.text)
        print(page_total)
        if page_total:
            pages = int(page_total[0])
            # Queue the remaining pages (range's end is exclusive, hence pages + 1)
            for city_page in range(2, pages + 1):
                if city_page > 3:  # debug cap: only the first 3 pages per city for now
                    return
                page_url = city_url + '/pg{}co32/'.format(city_page)
                yield scrapy.Request(page_url, callback=self.parse_city,
                                     meta={'city_h': copy.deepcopy(city_dict)})
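For reference, the totalPage value that parse_city extracts sits in an inline page-data attribute in the listing HTML. The fragment below is a made-up approximation of that markup, used only to sanity-check the corrected regex:

import re

sample = """page-data='{"totalPage":100,"curPage":1}'"""
print(re.findall(r'totalPage":(\d{1,3})', sample))  # ['100']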
pipelines.py
If there is a lot of data, JsonLinesItemExporter is a better way to save it (a sketch follows after the pipeline below).
from scrapy.exporters import JsonItemExporter


class AqiJsonPipeline(object):
    def open_spider(self, spider):
        self.file = open('lianjia2.json', 'wb')
        self.writer = JsonItemExporter(self.file, ensure_ascii=False, encoding='utf-8')
        self.writer.start_exporting()

    def process_item(self, item, spider):
        self.writer.export_item(item)
        return item

    def close_spider(self, spider):
        self.writer.finish_exporting()
        self.file.close()
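As the note above says, JsonLinesItemExporter writes one self-contained JSON object per line, so the output stays parseable even if the crawl is interrupted, whereas JsonItemExporter's single array is only valid JSON once finish_exporting has closed it. A minimal variant of the same pipeline (the .jl extension is just the usual JSON Lines convention):

from scrapy.exporters import JsonLinesItemExporter


class LianjiaJsonLinesPipeline(object):
    def open_spider(self, spider):
        self.file = open('lianjia2.jl', 'wb')
        self.writer = JsonLinesItemExporter(self.file, ensure_ascii=False, encoding='utf-8')

    def process_item(self, item, spider):
        self.writer.export_item(item)  # appends one JSON line per item immediately
        return item

    def close_spider(self, spider):
        self.file.close()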
items.py
import scrapy


class LianjiaItem(scrapy.Item):
    name = scrapy.Field()
    link = scrapy.Field()
    hoursing_area = scrapy.Field()
    hoursing_total_price = scrapy.Field()
    hoursing_unit_price = scrapy.Field()
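Nothing above runs until the pipeline is registered in the project settings. A sketch of the relevant entries, assuming the project package is named lianjia (as the import in lj.py suggests):

settings.py (excerpt)
ITEM_PIPELINES = {
    'lianjia.pipelines.AqiJsonPipeline': 300,  # the number is the execution order among pipelines
}
DOWNLOAD_DELAY = 1       # per-request delay; makes the time.sleep(1) in the spider redundant
ROBOTSTXT_OBEY = False   # assumption: only needed if robots.txt disallows the listing pages

With that in place, scrapy crawl lj writes lianjia2.json into the directory the crawl is started from.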