"""Crawl second-hand housing listings from sz.lianjia.com.

The script reads the pagination metadata from the search landing page,
then fetches every result page and collects one dict per listing entry.
Running (or importing) this module performs the network requests.
"""
import json
from time import sleep

import requests
from lxml import etree

url = "https://sz.lianjia.com/ershoufang/rs/"
headers = {
    "User-Agent": "",
    # The standard HTTP request header is spelled "Referer"; a "Refer"
    # key is not a recognised header name.
    "Referer": "https://sz.lianjia.com/ershoufang/pg2/",
}
base_url = "https://sz.lianjia.com/ershoufang/pg{}/"

# Seconds to wait for the server before giving up; without a timeout
# requests.get() can block indefinitely on a stalled connection.
REQUEST_TIMEOUT = 30

# (output key, XPath relative to one listing <li>) in output order.
_FIELD_XPATHS = (
    ("face", './a/img/@src'),
    ("title", './/div[@class="title"]/a/text()'),
    ("position", './/div[@class="positionInfo"]/a/text()'),
    ("house_info", './/div[@class="houseInfo"]/text()'),
    ("follow_info", './/div[@class="followInfo"]/text()'),
    ("price", './/div[@class="priceInfo"]/div[@class="totalPrice"]/span/text()'),
    ("unit_price", './/div[@class="priceInfo"]/div[@class="unitPrice"]/span/text()'),
)


def _first(values):
    """Return the first item of *values*, or None when it is empty."""
    return values[0] if values else None


def get_data(url: str) -> list:
    """Fetch one result page and return a list of listing dicts.

    Each ``<li>`` under ``ul.sellListContent`` produces one dict with the
    keys ``face``, ``title``, ``position``, ``house_info``,
    ``follow_info``, ``price`` and ``unit_price``. A field whose XPath
    matches nothing is stored as ``None`` instead of aborting the crawl
    with ``IndexError``. The ``tag`` key is present only when the entry
    has at least one tag.

    Args:
        url: Address of the result page to download.

    Returns:
        A list of dicts, one per listing entry, in page order.

    Raises:
        requests.exceptions.RequestException: If the download fails or
            exceeds ``REQUEST_TIMEOUT``.
    """
    resp = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)
    html = etree.HTML(resp.text)
    results = []
    for li in html.xpath('.//ul[@class="sellListContent"]/li'):
        content = {key: _first(li.xpath(path)) for key, path in _FIELD_XPATHS}
        tag = li.xpath('.//div[@class="tag"]//span/text()')
        if tag:
            content['tag'] = tag[0]
        results.append(content)
    return results


# Discover how many result pages exist from the landing page's pager.
resp = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)
html = etree.HTML(resp.text)
page_data = html.xpath('//*[@id="content"]//div[@class="page-box fr"]/div/@page-data')
if not page_data:
    # Same exception type as the bare index lookup would raise, but with
    # a message that says what was actually missing.
    raise IndexError(
        "no page-data attribute found in the response from " + url
    )
data = json.loads(page_data[0])
totalPage = data['totalPage']
curPage = data['curPage']

totalList = []
for i in range(1, totalPage + 1):
    url = base_url.format(i)
    print("crawl url " + url)
    cur_list = get_data(url)
    print(cur_list)
    # extend() appends in place instead of copying the accumulated list
    # on every page.
    totalList.extend(cur_list)

# Kept from the original script: leaves `url` pointing at the first page.
url = base_url.format(1)
print(totalList)