本人以前爬取过链家网的房价信息,所以爬取佛山的房价本应该是一件很简单的事情。但是在第31页遇到了隐藏样式,也就是 style="display:none",它隐藏了本应存在的"下一页"按钮,导致爬虫无法进入下一页。
"""Scraper for Foshan new-home listings on Lianjia (fs.fang.lianjia.com).

From page 31 onward the site hides the pager ("next page" button) with
``style="display:none"``, so instead of clicking that button the script
computes the next page's URL directly and stops once page 49 is reached.
Rows of (name, location, price) are appended to ``foshan.csv``.
"""
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
import time
from lxml import etree
import csv
import sys
import random


class foshan(object):
    """Crawl Lianjia Foshan listing pages and write one CSV row per building.

    NOTE(review): both methods rely on the module-level ``writer`` created
    in the ``__main__`` block; they are not usable without it.
    """

    def info(self, url):
        """Fetch one listing page, extract name/location/price for every
        building, write the rows via the module-level CSV ``writer``, then
        advance to the following page through :meth:`next`.

        :param url: full URL of the listing page to scrape.
        """
        # Pool of User-Agent strings; one is picked at random per request
        # to look less like a bot.
        user_list = [
            "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/534.7 (KHTML, like Gecko) Chrome/7.0.514.0 Safari/534.7",
            "Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/534.14 (KHTML, like Gecko) Chrome/9.0.601.0 Safari/534.14",
            "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.14 (KHTML, like Gecko) Chrome/10.0.601.0 Safari/534.14",
            "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.20 (KHTML, like Gecko) Chrome/11.0.672.2 Safari/534.20",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/534.27 (KHTML, like Gecko) Chrome/12.0.712.0 Safari/534.27",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/13.0.782.24 Safari/535.1",
            "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/535.2 (KHTML, like Gecko) Chrome/15.0.874.120 Safari/535.2",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.7 (KHTML, like Gecko) Chrome/16.0.912.36 Safari/535.7",
            "Mozilla/5.0 (Windows; U; Windows NT 6.0 x64; en-US; rv:1.9pre) Gecko/2008072421 Minefield/3.0.2pre",
            "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9.0.10) Gecko/2009042316 Firefox/3.0.10",
            "Mozilla/5.0 (Windows; U; Windows NT 6.0; en-GB; rv:1.9.0.11) Gecko/2009060215 Firefox/3.0.11 (.NET CLR 3.5.30729)",
            "Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6 GTB5",
            "Mozilla/5.0 (Windows; U; Windows NT 5.1; tr; rv:1.9.2.8) Gecko/20100722 Firefox/3.6.8 ( .NET CLR 3.5.30729; .NET4.0E)",
            "Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1"]
        # NOTE(review): the Cookie below is a captured session and will
        # expire; refresh it from a browser session when requests start
        # returning anti-bot pages.
        headers = {
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Connection': 'keep-alive',
            'Cookie': 'lianjia_uuid=c6a3fd6a-9e7d-40f7-ae69-30a22c362fe6; UM_distinctid=16893495b2e334-08bd8db0061c58-5d4e211f-e1000-16893495b30d3; Hm_lvt_9152f8221cb6243a53c83b956842be8a=1551672970; Hm_lpvt_9152f8221cb6243a53c83b956842be8a=1551672970; _smt_uid=5c4eaa55.1e413923; _jzqa=1.2802375526935162000.1548659285.1548659285.1551672971.2; _jzqc=1; _jzqx=1.1548659285.1551672971.1.jzqsr=so%2Ecom|jzqct=/link.-; _jzqckmp=1; CNZZDATA1259272651=168210500-1551669489-https%253A%252F%252Ffs.lianjia.com%252F%7C1551669489; _ga=GA1.2.342095840.1548659297; _gid=GA1.2.1045432802.1551672976; CNZZDATA1254525948=140498007-1551671620-https%253A%252F%252Ffs.lianjia.com%252F%7C1551671620; CNZZDATA1255633284=62067122-1551672000-https%253A%252F%252Ffs.lianjia.com%252F%7C1551672000; CNZZDATA1255604082=749737680-1551667722-https%253A%252F%252Ffs.lianjia.com%252F%7C1551667722; _jzqa=1.2802375526935162000.1548659285.1548659285.1548819734.2; _jzqc=1; _qzja=1.535258400.1551672980661.1551672980661.1551672980662.1551672980661.1551672980662.0.0.0.1.1; _qzjc=1; _qzjto=1.1.0; select_city=440600',
            'Host': 'fs.fang.lianjia.com',
            'Referer': 'https://fs.lianjia.com/?utm_source=360&utm_medium=pinzhuan&utm_term=biaoti&utm_content=biaoti&utm_campaign=biaoti',
            'User-Agent': random.choice(user_list)
        }
        rsp = requests.get(url, headers=headers)
        html = etree.HTML(rsp.text)
        soup = BeautifulSoup(rsp.text, 'lxml')
        # Building names and their headline per-square-metre prices.
        name = html.xpath('//div[@class="resblock-name"]/a/text()')
        price = html.xpath('//div[@class="resblock-price"]/div[@class="main-price"]/span[1]/text()')
        loc = []  # collected addresses
        for tag in soup.find_all('div', class_='resblock-location'):
            # Strip surrounding whitespace and embedded newlines/spaces.
            loc.append(tag.get_text().strip().replace(' ', ''))
        for row in zip(name, loc, price):
            print(row)
            writer.writerow(row)
        self.next(url)

    def next(self, url):
        """Work out the next page number and recurse into :meth:`info`.

        The static HTML hides ``div[@class="page-box"]`` from page 31 on,
        so the page is re-rendered with Selenium before reading the active
        page number; the next page's URL is then built directly instead of
        clicking the (hidden) "next" button. Exits the program once the
        next page number reaches 49.

        :param url: URL of the page that was just scraped.
        """
        # NOTE(review): backslashes in this path were mangled in the
        # original source ('D:PythonScriptschromedriver.exe'); confirm the
        # chromedriver location on the target machine.
        brow = webdriver.Chrome(r'D:\Python\Scripts\chromedriver.exe')
        try:
            brow.get(url)
            time.sleep(2)
            # Scroll down so the pager area is rendered before reading it.
            brow.execute_script('window.scrollTo(0,5000)')
            time.sleep(1)
            # Re-parse the Selenium-rendered DOM; the plain requests
            # response hides the pager from page 31 onward.
            html = etree.HTML(brow.page_source)
            page = html.xpath('//div[@class="page-box"]/span[@class="active"]/text()')
            try:
                page = int(page[0]) + 1  # number of the NEXT page
            except (IndexError, ValueError):
                # Even the rendered DOM keeps the pager hidden on page 31,
                # so the xpath comes back empty; fall back to the known
                # next page. NOTE(review): if later pages also hide the
                # pager this fallback would repeat page 32 — verify.
                page = 32
        finally:
            # Always release the browser, even if rendering/parsing fails.
            brow.close()
        next_url = ('https://fs.fang.lianjia.com/loupan/pg'
                    + str(page) + 'rs%E4%BD%9B%E5%B1%B1/')
        if page >= 49:
            # Last page reached — stop the whole crawl.
            sys.exit()
        self.info(next_url)


if __name__ == '__main__':
    url = 'https://fs.fang.lianjia.com/loupan/pg30rs%E4%BD%9B%E5%B1%B1/'
    # newline='' prevents blank lines between CSV rows on Windows.
    file = open('foshan.csv', 'a', newline='', encoding='utf-8')
    writer = csv.writer(file)
    writer.writerow(['名称', '地点', '价格'])
    foshan().info(url)