zoukankan      html  css  js  c++  java
  • 佛山链家-bs修改网页代码难题-待突破

    本人以前爬取过链家网的房价信息,所以爬取佛山的房价本应该是一件很简单的事情。但是在第31页遇到了隐藏代码,也就是 style="display:none",它隐藏了本应该有的"下一页"按钮,导致无法进入下一页。

    import requests
    from bs4 import BeautifulSoup
    from selenium import webdriver
    import time
    from lxml import etree
    import csv
    import sys
    import random
    
    
    class foshan(object):
        """Scrape Foshan new-development listings from fs.fang.lianjia.com.

        ``info`` downloads one listing page, writes one (name, location, price)
        row per listing to the module-level ``writer`` and then calls ``next``.
        ``next`` works out the number of the following page and calls ``info``
        again.  The crawl stops once the following page number reaches
        ``STOP_PAGE``.
        """

        # NOTE(review): the original literal read 'D:PythonScriptschromedriver.exe'.
        # Its backslashes appear to have been lost (the same way the '\n' literal
        # in the location clean-up was mangled), so the path is restored here as a
        # raw string.  Adjust it to the local chromedriver install.
        CHROMEDRIVER_PATH = r'D:\Python\Scripts\chromedriver.exe'

        # Listing URL template; ``{}`` is replaced by the page number.
        LISTING_URL = 'https://fs.fang.lianjia.com/loupan/pg{}rs%E4%BD%9B%E5%B1%B1/'

        # The crawl ends when the *next* page number reaches this value.
        STOP_PAGE = 49

        # Last-resort next page, used only when neither the rendered pager nor
        # the URL yields a page number (this is the original hard-coded value).
        FALLBACK_PAGE = 32

        # Seconds to wait for the listing page before giving up.
        REQUEST_TIMEOUT = 30

        # One of these is picked at random for every request.
        USER_AGENTS = (
            "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/534.7 (KHTML, like Gecko) Chrome/7.0.514.0 Safari/534.7",
            "Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/534.14 (KHTML, like Gecko) Chrome/9.0.601.0 Safari/534.14",
            "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.14 (KHTML, like Gecko) Chrome/10.0.601.0 Safari/534.14",
            "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.20 (KHTML, like Gecko) Chrome/11.0.672.2 Safari/534.20",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/534.27 (KHTML, like Gecko) Chrome/12.0.712.0 Safari/534.27",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/13.0.782.24 Safari/535.1",
            "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/535.2 (KHTML, like Gecko) Chrome/15.0.874.120 Safari/535.2",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.7 (KHTML, like Gecko) Chrome/16.0.912.36 Safari/535.7",
            "Mozilla/5.0 (Windows; U; Windows NT 6.0 x64; en-US; rv:1.9pre) Gecko/2008072421 Minefield/3.0.2pre",
            "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9.0.10) Gecko/2009042316 Firefox/3.0.10",
            "Mozilla/5.0 (Windows; U; Windows NT 6.0; en-GB; rv:1.9.0.11) Gecko/2009060215 Firefox/3.0.11 (.NET CLR 3.5.30729)",
            "Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6 GTB5",
            "Mozilla/5.0 (Windows; U; Windows NT 5.1; tr; rv:1.9.2.8) Gecko/20100722 Firefox/3.6.8 ( .NET CLR 3.5.30729; .NET4.0E)",
            "Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1",
        )

        @staticmethod
        def _clean_location(text):
            """Return *text* stripped at both ends and with all newlines removed."""
            return text.strip().replace('\n', '')

        @staticmethod
        def _page_from_url(url):
            """Return the page number embedded after '/loupan/pg' in *url*, or None."""
            tail = url.partition('/loupan/pg')[2]
            digits = ''
            for ch in tail:
                if ch not in '0123456789':
                    break
                digits += ch
            return int(digits) if digits else None

        def info(self, url):
            """Scrape one listing page and continue with the following page.

            Downloads *url*, extracts the name, location and price of every
            listing, prints each row and writes it through the module-level
            ``writer`` (the ``csv.writer`` created by the ``__main__`` block).
            Finally calls ``self.next(url)`` to move on.

            :param url: address of the listing page to scrape.
            """
            headers = {
                'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
                'Connection': 'keep-alive',
                'Cookie': 'lianjia_uuid=c6a3fd6a-9e7d-40f7-ae69-30a22c362fe6; UM_distinctid=16893495b2e334-08bd8db0061c58-5d4e211f-e1000-16893495b30d3; Hm_lvt_9152f8221cb6243a53c83b956842be8a=1551672970; Hm_lpvt_9152f8221cb6243a53c83b956842be8a=1551672970; _smt_uid=5c4eaa55.1e413923; _jzqa=1.2802375526935162000.1548659285.1548659285.1551672971.2; _jzqc=1; _jzqx=1.1548659285.1551672971.1.jzqsr=so%2Ecom|jzqct=/link.-; _jzqckmp=1; CNZZDATA1259272651=168210500-1551669489-https%253A%252F%252Ffs.lianjia.com%252F%7C1551669489; _ga=GA1.2.342095840.1548659297; _gid=GA1.2.1045432802.1551672976; CNZZDATA1254525948=140498007-1551671620-https%253A%252F%252Ffs.lianjia.com%252F%7C1551671620; CNZZDATA1255633284=62067122-1551672000-https%253A%252F%252Ffs.lianjia.com%252F%7C1551672000; CNZZDATA1255604082=749737680-1551667722-https%253A%252F%252Ffs.lianjia.com%252F%7C1551667722; _jzqa=1.2802375526935162000.1548659285.1548659285.1548819734.2; _jzqc=1; _qzja=1.535258400.1551672980661.1551672980661.1551672980662.1551672980661.1551672980662.0.0.0.1.1; _qzjc=1; _qzjto=1.1.0; select_city=440600',
                'Host': 'fs.fang.lianjia.com',
                'Referer': 'https://fs.lianjia.com/?utm_source=360&utm_medium=pinzhuan&utm_term=biaoti&utm_content=biaoti&utm_campaign=biaoti',
                'User-Agent': random.choice(self.USER_AGENTS),
            }
            # A timeout keeps a stalled connection from hanging the crawl forever.
            rsp = requests.get(url, headers=headers, timeout=self.REQUEST_TIMEOUT)
            html = etree.HTML(rsp.text)
            soup = BeautifulSoup(rsp.text, 'lxml')

            names = html.xpath('//div[@class="resblock-name"]/a/text()')
            prices = html.xpath('//div[@class="resblock-price"]/div[@class="main-price"]/span[1]/text()')
            locations = [
                self._clean_location(block.get_text())
                for block in soup.find_all('div', class_='resblock-location')
            ]

            # zip() stops at the shortest list, so a listing that lacks one of
            # the three fields shortens the output for the whole page.
            for row in zip(names, locations, prices):
                print(row)
                writer.writerow(row)

            self.next(url)

        def next(self, url):
            """Work out the page after *url* and scrape it via ``info``.

            The page is rendered in Chrome because, per the original notes, the
            pager is hidden in the static HTML.  The next page number is the
            pager's active page plus one.  If that cannot be read (the "next"
            button is hidden on page 31 even after rendering), the page number
            embedded in *url* plus one is used instead, and only if that is also
            missing does it fall back to ``FALLBACK_PAGE``.

            Returns without scraping once the next page number reaches
            ``STOP_PAGE``.

            :param url: address of the page that has just been scraped.
            """
            brow = webdriver.Chrome(self.CHROMEDRIVER_PATH)
            try:
                brow.get(url)
                time.sleep(2)
                brow.execute_script('window.scrollTo(0,5000)')
                time.sleep(1)
                rendered = brow.page_source
            finally:
                # quit() ends the driver session even when loading fails.
                brow.quit()

            active = etree.HTML(rendered).xpath(
                '//div[@class="page-box"]/span[@class="active"]/text()')
            try:
                page = int(active[0]) + 1  # number of the next page
            except (IndexError, ValueError):
                # Pager missing or unreadable: derive the page from the URL so
                # the crawl keeps advancing instead of revisiting one page.
                current = self._page_from_url(url)
                page = self.FALLBACK_PAGE if current is None else current + 1

            if page >= self.STOP_PAGE:
                return
            self.info(self.LISTING_URL.format(page))
    
    
    if __name__=='__main__':
        # Entry point: crawl the Foshan listings starting at page 30 and append
        # the rows to foshan.csv.
        url = 'https://fs.fang.lianjia.com/loupan/pg30rs%E4%BD%9B%E5%B1%B1/'
        # newline='' stops the csv module from emitting blank lines between rows.
        # The with-block closes the file even if the crawl ends via an exception
        # or sys.exit().
        with open('foshan.csv', 'a', newline='', encoding='utf-8') as csv_file:
            # ``writer`` stays a module-level name because foshan.info() uses it.
            writer = csv.writer(csv_file)
            # The file is opened in append mode, so write the header only when
            # the file is still empty; otherwise every run would add it again.
            if csv_file.tell() == 0:
                writer.writerow(['名称', '地点', '价格'])
            foshan().info(url)
    

      

  • 相关阅读:
    Linux运维常用的几个命令介绍【转】
    Linux 删除文件后空间不释放【原创】
    使用 Xtrabackup 在线对MySQL做主从复制【转】
    用Centos7搭建小微企业Samba文件共享服务器【转】
    工作流数据表设计
    mysql函数大全
    git 分支管理
    Bootstap datetimepicker报错TypeError: intermediate value
    分分钟搞定IOS远程消息推送
    Windows10下安装OpenSSL
  • 原文地址:https://www.cnblogs.com/fodalaoyao/p/10474901.html
Copyright © 2011-2022 走看看