zoukankan      html  css  js  c++  java
  • python 抓取房源信息

    目前完成了我爱我家和链家的房源信息获取,代码写得比较粗糙~

    houseWoaiwojia.py:

    #encoding=utf-8
    import result as r
    import db
    import datetime
    import houseLianjiaVo as vo
    import json
    import time
    
    
    
    def getdata(url):
        """Scrape one 5i5j listing page into house records.

        The page is fetched with ``r.getUrl``.  Every ``<li>`` of the first
        ``ul.pList`` element that carries an ``h3.listTit`` heading is parsed
        into a ``vo.houseLianjiaVo``.  Listings whose code is already in the
        module-level ``arrayList`` are skipped; codes of the returned records
        are appended to ``arrayList`` so later pages do not return them again.

        A listing that cannot be parsed is reported on stdout and skipped; it
        never aborts the rest of the page.

        Args:
            url: URL of the listing page to fetch.

        Returns:
            A list of ``vo.houseLianjiaVo`` objects.  Empty when the page has
            no ``ul.pList`` element or contains no new listings.
        """
        soup = r.getUrl(url)
        pList = soup.find_all("ul", class_="pList")
        # A page without the listing container yields zero items instead of
        # raising IndexError on pList[0].
        all_li = pList[0].find_all("li") if pList else []
        print("url:%s     li数量:%i"%(url,len(all_li)))
        houses = []
        for i in all_li:
            listTit = i.find_all("h3", class_="listTit")
            if not listTit:
                continue
            # Bound before the try so the failure message can always use it.
            title = ""
            try:
                title_a = listTit[0].find_all("a")[0]
                title = title_a.getText()

                # The listing code is stored as JSON in the anchor's tdjson attribute.
                code = json.loads(title_a["tdjson"])["content"]
                if code in arrayList:
                    continue

                # The image URL is in "src" or, failing that, in "data-src".
                img = ""
                lazy_imgs = i.find_all("img", class_="lazy")
                if lazy_imgs:
                    attrs = lazy_imgs[0].attrs
                    if "src" in attrs:
                        img = attrs["src"]
                    elif "data-src" in attrs:
                        img = attrs["data-src"]
                href = domain + title_a["href"]

                listX_p = i.find_all("div", class_="listX")[0].find_all("p")
                houseInfo = listX_p[0].getText()
                region2 = listX_p[1].getText().split(" ")[0]
                region1 = listX_p[1].find_all("a")[0].getText()
                release_time = listX_p[2].getText().split("·")[2]

                jia = i.find_all("div", class_="jia")[0]
                total_price = jia.find_all("strong")[0].getText()
                data_price = jia.find_all("p")[1].getText()

                # houseInfo is "·"-separated in the order: structure, area,
                # orientation, floor, decoration, material.  Pad to six fields
                # so a shorter description leaves the tail empty instead of
                # discarding the whole listing.
                h = [part.strip() for part in houseInfo.split("·")]
                h += [""] * (6 - len(h))
                (house_structure, total_square, orientation,
                 floor, decoration_degree, material) = h[:6]

                create_time = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
                v = vo.houseLianjiaVo(code,img,title,href,region1,region2,
                                      house_structure,total_square,orientation,decoration_degree,floor,material,
                                      create_time,release_time,total_price,data_price)
                houses.append(v)
                arrayList.append(code)
            except Exception as err:
                # Exception (not BaseException) so Ctrl-C still stops the crawl.
                print("失败,url:%s,title:%s,error:%s" % (url, title, err))

        return houses
    
    
    
    def encapsulation_db(list):
        """Build one multi-row ``insert into house`` statement.

        Each record contributes one ``(null, type, code, ...)`` tuple; the
        leading ``null`` is the first column and the module-level
        ``house_type`` is the second.  Values are rendered as single-quoted SQL
        string literals with backslashes and single quotes doubled, so scraped
        text containing ``'`` can no longer break or alter the statement.

        NOTE(review): a parameterized ``cursor.execute(sql, args)`` would be
        safer still, but this function's contract is to return a plain SQL
        string.

        Args:
            list: Sequence of records exposing the attributes ``code``,
                ``img``, ``title``, ``href``, ``region1``, ``region2``,
                ``house_structure``, ``total_square``, ``orientation``,
                ``decoration_degree``, ``floor``, ``material``,
                ``create_time``, ``release_time``, ``total_price`` and
                ``data_price``.

        Returns:
            The SQL statement terminated by ``;``, or ``""`` when ``list`` is
            empty (there is nothing to insert).
        """
        if not list:
            return ""

        def quote(value):
            # Escape backslashes first, then quotes, and wrap in single quotes.
            text = str(value).replace("\\", "\\\\").replace("'", "''")
            return "'%s'" % text

        rows = []
        for d in list:
            values = (
                house_type,
                d.code,
                d.img,
                d.title,
                d.href,
                d.region1,
                d.region2,
                d.house_structure,
                d.total_square,
                d.orientation,
                d.decoration_degree,
                d.floor,
                d.material,
                d.create_time,
                d.release_time,
                d.total_price,
                d.data_price,
            )
            rows.append("(null,%s)" % ",".join(quote(v) for v in values))
        return """ insert into house  values""" + ",".join(rows) + ";"
    
    
    
    
    
    def date_util(date, utc_offset_hours=8):
        """Format a millisecond epoch timestamp as ``YYYY-MM-DD HH:MM:SS``.

        The original rendered the value in UTC, which its own comment noted
        was eight hours off.  The timestamp is now rendered in a fixed-offset
        zone that defaults to UTC+8.

        Args:
            date: Timestamp in milliseconds since the Unix epoch.
            utc_offset_hours: Offset from UTC, in hours, of the zone used for
                rendering.  Pass ``0`` for the previous UTC output.

        Returns:
            The formatted date-time string.
        """
        zone = datetime.timezone(datetime.timedelta(hours=utc_offset_hours))
        moment = datetime.datetime.fromtimestamp(date / 1000, tz=zone)
        return moment.strftime("%Y-%m-%d %H:%M:%S")
    
    # Codes of listings already stored or already scraped in this run.
    arrayList = []
    # Site root; listing hrefs on the page are appended to it.
    domain = "https://hz.5i5j.com"
    # Value written to the `type` column for every row from this scraper.
    house_type ="我爱我家"
    if __name__ == '__main__':
        # NOTE(review): the database credentials are hard-coded here; consider
        # reading them from the environment or a config file.
        # The handle gets its own name so the imported `db` module is not rebound.
        handle = db.DataBaseHandle('127.0.0.1', 'root', '1qaz@WSX', 'house', 3306)
        try:
            known = handle.selectDb("select code from house where type ='%s'  "%house_type)
            # selectDb may return None when the query fails; treat that as "no rows".
            for row in known or ():
                arrayList.append(row[0])

            # Crawl listing pages n1 .. n100.
            for i in range(1,101):
                page = "n%s"%i
                url = domain + "/ershoufang/" + page
                houses = getdata(url)
                if not houses:
                    continue
                sql = encapsulation_db(houses)
                try:
                    handle.insertDB(sql)
                    print("插入数据库成功%s" % page)
                except Exception:
                    # Exception (not BaseException) so Ctrl-C still stops the crawl.
                    print("插入数据库失败%s" % page)
        finally:
            # Release the connection even when a page aborts the run.
            handle.closeDb()

    houseLianjia.py:

    #encoding=utf-8
    import result as r
    import db
    import datetime
    import houseLianjiaVo as vo
    import time
    
    
    
    def getdata(url):
        """Scrape one Lianjia listing page into house records.

        The page is fetched with ``r.getUrl``.  Every ``<li>`` of the first
        ``ul.sellListContent`` element that contains a ``div.title`` block is
        parsed into a ``vo.houseLianjiaVo``.  Listings whose code is already in
        the module-level ``arrayList`` are skipped; codes of the returned
        records are appended to ``arrayList`` so later pages do not return
        them again.

        A listing that cannot be parsed is reported on stdout and skipped; it
        never aborts the rest of the page.

        Args:
            url: URL of the listing page to fetch.

        Returns:
            A list of ``vo.houseLianjiaVo`` objects.  Empty when the page has
            no ``ul.sellListContent`` element or contains no new listings.
        """
        soup = r.getUrl(url)
        sellListContent = soup.find_all("ul", class_="sellListContent")
        # A page without the listing container yields zero items instead of
        # raising IndexError on sellListContent[0].
        all_li = sellListContent[0].find_all("li") if sellListContent else []
        print("url:%s     li数量:%i"%(url,len(all_li)))
        houses = []
        for i in all_li:
            title_divs = i.find_all("div", class_="title")
            if not title_divs:
                # An <li> without a title block cannot be parsed as a listing.
                continue
            # Bound before the try so the failure message can always use it.
            title = ""
            try:
                title_a = title_divs[0].find_all("a")[0]
                title = title_a.getText()
                code = title_a["data-housecode"]
                if code in arrayList:
                    continue
                img = i.find_all("img", class_="lj-lazy")[0]["src"]
                href = title_a["href"]

                positionInfo_a = i.find_all("div", class_="positionInfo")[0].find_all("a")
                region1 = positionInfo_a[0].getText()
                region2 = positionInfo_a[1].getText()

                # houseInfo is "|"-separated in the order: structure, area,
                # orientation, decoration, floor, material.  Whitespace around
                # each field is stripped, and the list is padded to six fields
                # so a shorter description leaves the tail empty instead of
                # discarding the whole listing.
                houseInfo = i.find_all("div", class_="houseInfo")[0].getText()
                h = [part.strip() for part in houseInfo.split("|")]
                h += [""] * (6 - len(h))
                (house_structure, total_square, orientation,
                 decoration_degree, floor, material) = h[:6]

                release_time = i.find_all("div" , class_ = "followInfo")[0].getText().split("/")[1]
                total_price = i.find_all("div", class_="totalPrice")[0].find_all("span")[0].getText()
                data_price = i.find_all("div", class_="unitPrice")[0]["data-price"]
                create_time = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
                v = vo.houseLianjiaVo(code,img,title,href,region1,region2,
                                      house_structure,total_square,orientation,decoration_degree,floor,material,
                                      create_time,release_time,total_price,data_price)
                houses.append(v)
                arrayList.append(code)
            except Exception as err:
                # Exception (not BaseException) so Ctrl-C still stops the crawl.
                print("失败,url:%s,title:%s,error:%s" % (url, title, err))

        return houses
    
    
    
    def encapsulation_db(list):
        """Build one multi-row ``insert into house`` statement.

        Each record contributes one ``(null, type, code, ...)`` tuple; the
        leading ``null`` is the first column and the module-level
        ``house_type`` is the second.  Values are rendered as single-quoted SQL
        string literals with backslashes and single quotes doubled, so scraped
        text containing ``'`` can no longer break or alter the statement.

        NOTE(review): a parameterized ``cursor.execute(sql, args)`` would be
        safer still, but this function's contract is to return a plain SQL
        string.

        Args:
            list: Sequence of records exposing the attributes ``code``,
                ``img``, ``title``, ``href``, ``region1``, ``region2``,
                ``house_structure``, ``total_square``, ``orientation``,
                ``decoration_degree``, ``floor``, ``material``,
                ``create_time``, ``release_time``, ``total_price`` and
                ``data_price``.

        Returns:
            The SQL statement terminated by ``;``, or ``""`` when ``list`` is
            empty (there is nothing to insert).
        """
        if not list:
            return ""

        def as_literal(value):
            # Escape backslashes first, then quotes, and wrap in single quotes.
            escaped = str(value).replace("\\", "\\\\").replace("'", "''")
            return "'" + escaped + "'"

        tuples = []
        for d in list:
            columns = (
                house_type,
                d.code,
                d.img,
                d.title,
                d.href,
                d.region1,
                d.region2,
                d.house_structure,
                d.total_square,
                d.orientation,
                d.decoration_degree,
                d.floor,
                d.material,
                d.create_time,
                d.release_time,
                d.total_price,
                d.data_price,
            )
            tuples.append("(null," + ",".join(as_literal(c) for c in columns) + ")")
        return """ insert into house  values""" + ",".join(tuples) + ";"
    
    
    
    
    def date_util(date, utc_offset_hours=8):
        """Format a millisecond epoch timestamp as ``YYYY-MM-DD HH:MM:SS``.

        The original rendered the value in UTC, which its own comment noted
        was eight hours off.  The timestamp is now rendered in a fixed-offset
        zone that defaults to UTC+8.

        Args:
            date: Timestamp in milliseconds since the Unix epoch.
            utc_offset_hours: Offset from UTC, in hours, of the zone used for
                rendering.  Pass ``0`` for the previous UTC output.

        Returns:
            The formatted date-time string.
        """
        offset = datetime.timedelta(hours=utc_offset_hours)
        stamp = datetime.datetime.fromtimestamp(date / 1000, tz=datetime.timezone(offset))
        return stamp.strftime("%Y-%m-%d %H:%M:%S")
    
    # Codes of listings already stored or already scraped in this run.
    arrayList = []
    # Value written to the `type` column for every row from this scraper.
    house_type = "链家"
    if __name__ == '__main__':
        # NOTE(review): the database credentials are hard-coded here; consider
        # reading them from the environment or a config file.
        # The handle gets its own name so the imported `db` module is not rebound.
        handle = db.DataBaseHandle('127.0.0.1', 'root', '1qaz@WSX', 'house', 3306)
        try:
            known = handle.selectDb("select code from house where type ='%s'  " % house_type)
            # selectDb may return None when the query fails; treat that as "no rows".
            for row in known or ():
                arrayList.append(row[0])

            # Crawl listing pages pg1 .. pg100.
            for i in range(1,101):
                page = "pg%s"%i
                url = "https://hz.lianjia.com/ershoufang/" + page + "co32/"
                houses = getdata(url)
                if not houses:
                    continue
                sql = encapsulation_db(houses)
                try:
                    handle.insertDB(sql)
                    print("插入数据库成功%s" % page)
                except Exception:
                    # Exception (not BaseException) so Ctrl-C still stops the crawl.
                    print("插入数据库失败%s" % page)
        finally:
            # Release the connection even when a page aborts the run.
            handle.closeDb()

    result.py

    import requests
    from bs4 import BeautifulSoup
    import ip_list
    
    # 消息头
    # headers ={
    # "Accept":"text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3",
    # "Accept-Encoding":"gzip, deflate, br",
    # "Accept-Language":"zh-CN,zh;q=0.9",
    # "Cache-Control":"max-age=0",
    # "Connection":"keep-alive",
    # "Cookie":"TS013af1c1=01ef8f99f1d0871b269c53340fa6269029504a80c4e2e4fb5f8c141dc3ea1889dfe9b73e3546aaef50810eb7137614f90423824a3a; _gscu_415563124=710350909btrml20; _gscbrs_415563124=1; TS01dde381_77=087968f3e8ab280075ffa9551835e2b3bc9a0a6ef1f753105fc148025ab082fd79a1cb86cc3a8f52e22695c031dd504308dd5f77e7823800bc61bb50494d8e87319bf3b42d4db90a536ed3feb83a77f2f122231d05b1d9d2348a3ef88a547a3a4fa01ac0c30c12acae8ea546e6c1a1ba; TSf97de9a7_76=087968f3e8ab2800694c407f83260b5fb36f147a01070c43e22a21076b5ec514c7b53bce88b8138a25acd182ef87b3ba08ef9bf94f07e800607868f728157db181454561ee310058f9e829ec0810c6cdcb21744ee6aac2d22d2d391d9dec7ed93dd2cc97f0534a13176b017915a82198365ab759a9c450c111a80907ed69974e36be3d3b9a2329829301cd8625d168c2f1b3b00c879662fc185e5c040d86ecabf8d9fd0d7582082883f0e4517e9ed01aef8fa6c301b7e34fba91950ff8a73444c94299ebebf81d60a295b2b378cb7f282d8c42bde8c1c6278b6e33bec5e77c19753bc6bf5a685fefd3e5bc832bf7b228faa342f439fdc647c4c009f2c59d7051f66d584aecb72f84a3a0ae4ad34e90593f62365471bf182f873e90c607771894; TSPD_101=087968f3e8ab2800694c407f83260b5fb36f147a01070c43e22a21076b5ec514c7b53bce88b8138a25acd182ef87b3ba:087968f3e8ab2800694c407f83260b5fb36f147a01070c43e22a21076b5ec514c7b53bce88b8138a25acd182ef87b3ba08ef9bf94f06300055e7cf7b71c21fcf57bb0d7a08e541b632d1e81bc2b89a1a0b150eb4c70f05a351fc3a1b4aa2c87583b1593295915bf8; a6c1b8e3d8ee43f7b55efdb3b44bd46e=WyIzNDA2NzQ2MzE0Il0; TS01dde381=01ef8f99f13c1ae3080e65ae71621810e05b79c0c6aa9bdd4bdd59420bc12c88c9cf07ae6c9dbbd39b34b715438e2a022c50ce917f; TSf97de9a7_27=087968f3e8ab2000afda1f036834422803e5ea6f78d56a8329a6c70c885298f80c660dbeab022de90855e1bd82092000ce04d01001dd1ea60427ae89da80b6ff03d8eafda5efd5e059e62400b4741136",
    # "Host":"www.12309.gov.cn",
    # "If-None-Match":'W/"10d1c-m2GHDG7mOl/LWWDK4ZcVfpV31es"',
    # "Sec-Fetch-Mode":"navigate",
    # "Sec-Fetch-Site":"none",
    # "Sec-Fetch-User":"?1",
    # "Upgrade-Insecure-Requests":"1",
    # "User-Agent":"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.90 Safari/537.36",
    # }
    # HTTP headers passed to every request made by this module (currently none).
    headers ={}
    # POST a JSON body and decode the JSON reply.
    def postUrl(url,json_data,timeout=30):
        """Send ``json_data`` as a JSON body via POST and return the decoded reply.

        Args:
            url: Target URL.
            json_data: Object serialized by ``requests`` as the JSON request body.
            timeout: Seconds to wait for the server before ``requests`` raises
                a timeout error.  Without it a stalled server would block the
                caller forever.

        Returns:
            The response body decoded with ``Response.json()``.
        """
        result = requests.post(url, json=json_data, headers=headers, timeout=timeout)
        return result.json()
    
    # GET a URL and decode the JSON reply.
    def getUrlJson(url,timeout=30):
        """Fetch ``url`` with GET and return the decoded JSON reply.

        Args:
            url: Target URL.
            timeout: Seconds to wait for the server before ``requests`` raises
                a timeout error.  Without it a stalled server would block the
                caller forever.

        Returns:
            The response body decoded with ``Response.json()``.
        """
        result = requests.get(url, headers=headers, timeout=timeout)
        return result.json()
    
    
    def getUrl(url,timeout=30):
        """Fetch ``url`` with GET and parse the body into a BeautifulSoup tree.

        The request is sent directly, without a proxy.

        Args:
            url: Page URL.
            timeout: Seconds to wait for the server before ``requests`` raises
                a timeout error.  Without it a stalled server would block the
                caller forever.

        Returns:
            A ``BeautifulSoup`` object built with the ``html.parser`` backend
            from the raw response bytes.
        """
        content = requests.get(url, headers=headers, timeout=timeout).content
        return BeautifulSoup(content, "html.parser")

    db.py:

    import pymysql
    
    
    class DataBaseHandle(object):
        """Small helper around a single PyMySQL connection.

        The connection is opened in ``__init__`` and kept in ``self.db`` until
        ``closeDb`` is called.  Every method opens a cursor for one statement
        and closes it afterwards.  Database errors are printed rather than
        raised; the return value tells the caller whether the call worked.
        """

        def __init__(self,host,username,password,database,port):
            """Store the connection settings and open the connection.

            Args:
                host: MySQL server host name or address.
                username: Login user.
                password: Login password.
                database: Schema selected after connecting.
                port: TCP port of the server.
            """
            self.host = host
            self.username = username
            self.password = password
            self.database = database
            self.port = port
            # Keyword arguments: PyMySQL 1.0+ no longer accepts these values
            # positionally.  utf8mb4 covers 4-byte characters that the 3-byte
            # "utf8" charset rejects.
            self.db = pymysql.connect(
                host=self.host,
                user=self.username,
                password=self.password,
                database=self.database,
                port=self.port,
                charset='utf8mb4',
            )

        def _execute_write(self, sql):
            """Run one data-modifying statement: commit on success, roll back on error."""
            self.cursor = self.db.cursor()
            try:
                self.cursor.execute(sql)
                self.db.commit()
                return True
            except Exception as ex:
                # Undo the partial statement and report why it failed.
                self.db.rollback()
                print("数据库异常",ex)
                return False
            finally:
                self.cursor.close()

        def insertDB(self,sql):
            """Execute an INSERT statement.

            Returns:
                ``True`` when the statement was committed, ``False`` when it
                failed and was rolled back.
            """
            return self._execute_write(sql)

        def deleteDB(self,sql):
            """Execute a DELETE statement.

            Returns:
                ``True`` when the statement was committed, ``False`` when it
                failed and was rolled back.
            """
            return self._execute_write(sql)

        def updateDb(self,sql):
            """Execute an UPDATE statement.

            Returns:
                ``True`` when the statement was committed, ``False`` when it
                failed and was rolled back.
            """
            return self._execute_write(sql)

        def selectDb(self,sql):
            """Execute a query and return every row.

            The fetched rows are also printed to stdout.

            Returns:
                Whatever ``cursor.fetchall()`` returns, or an empty tuple when
                the query fails, so callers can always iterate the result.
            """
            self.cursor = self.db.cursor()
            try:
                self.cursor.execute(sql)
                data = self.cursor.fetchall()
                print(data)
                return data
            except Exception as ex:
                print("数据库异常",ex)
                return ()
            finally:
                self.cursor.close()

        def closeDb(self):
            """Close the database connection."""
            self.db.close()
    
    
    
    if __name__ == '__main__':
        # Manual check: connect to the local `test` schema, run (and print)
        # a small query against `dept`, then close the connection.
        DbHandle = DataBaseHandle(
            host='127.0.0.1',
            username='root',
            password='1qaz@WSX',
            database='test',
            port=3306,
        )
        DbHandle.selectDb('select * from dept limit 5 ')
        DbHandle.closeDb()

    实体封装houseLianjiaVo.py

    class houseLianjiaVo:
        """Plain value object describing one scraped house listing.

        All constructor arguments are stored unchanged under attributes of the
        same name; ``id`` starts as ``None``.
        """

        def __init__(self, code, img, title, href, region1, region2, house_structure, total_square, orientation,
                     decoration_degree, floor,material, create_time,release_time, total_price, data_price):
            self.id = None  # auto-increment id
            self.code = code  # listing code
            self.img = img  # image URL
            self.title = title  # title
            self.href = href  # link
            self.region1 = region1  # residential community name
            self.region2 = region2  # district
            self.house_structure = house_structure  # room layout
            self.total_square = total_square  # total floor area
            self.orientation = orientation  # orientation
            self.decoration_degree = decoration_degree  # decoration level
            self.floor = floor  # floor
            self.material = material  # building material
            self.create_time = create_time  # record creation time
            self.release_time = release_time  # publication time
            self.total_price = total_price  # total price
            self.data_price = data_price  # unit price

        # Previously defined at module level (one indent too shallow), so
        # str(instance) fell back to the default object representation.
        def __str__(self):
            return 'code:%s  title:%s  ' % (self.code, self.title)

    数据库表结构

    -- Table `house`: one row per scraped listing. Every scraped field is stored
    -- as text (varchar). `id` is the auto-increment primary key; `code` has a
    -- secondary, non-unique index (`idx_code`).
    -- NOTE(review): the column comment on `code` says "primary key" (主键),
    -- but the actual PRIMARY KEY is `id`; nothing here prevents duplicate codes.
    -- NOTE(review): `title` is limited to 32 characters - confirm that listing
    -- titles always fit.
    CREATE TABLE `house` (
      `id` bigint(20) unsigned NOT NULL AUTO_INCREMENT COMMENT '自增id',
      `type` varchar(255) DEFAULT NULL,
      `code` varchar(32) DEFAULT NULL COMMENT '主键',
      `img` varchar(1000) DEFAULT NULL COMMENT '图片',
      `title` varchar(32) DEFAULT NULL COMMENT '标题',
      `href` varchar(256) DEFAULT NULL COMMENT '链接',
      `region1` varchar(32) DEFAULT NULL COMMENT '小区名',
      `region2` varchar(32) DEFAULT NULL COMMENT '区域',
      `house_structure` varchar(32) DEFAULT NULL COMMENT '房屋结构',
      `total_square` varchar(32) DEFAULT NULL COMMENT '总平方数',
      `orientation` varchar(32) DEFAULT NULL COMMENT '朝向',
      `decoration_degree` varchar(32) DEFAULT NULL COMMENT '装修程度',
      `floor` varchar(32) DEFAULT NULL COMMENT '楼层',
      `material` varchar(32) DEFAULT NULL COMMENT '材料',
      `create_time` varchar(32) DEFAULT NULL COMMENT '创建时间',
      `release_time` varchar(32) DEFAULT NULL COMMENT '发布时间',
      `total_price` varchar(32) DEFAULT NULL COMMENT '总价',
      `data_price` varchar(32) DEFAULT NULL COMMENT '单价',
      PRIMARY KEY (`id`) USING BTREE,
      KEY `idx_code` (`code`) USING BTREE COMMENT '编码索引'
    ) ENGINE=InnoDB AUTO_INCREMENT=14999 DEFAULT CHARSET=utf8mb4 ROW_FORMAT=DYNAMIC COMMENT='房屋信息表-链家';
  • 相关阅读:
    牛客挑战赛45 D.坐标
    树上启发式合并(dsu on tree)合集
    2020HDU多校第二场 1012.String Distance
    2020HDU多校第一场 1009.Leading Robots
    2020牛客暑期多校训练营(第一场)H.Minimum-cost Flow
    自用综合线段树模板(区间加乘、区间置数、区间求和)
    ZOJ 4008.Yet Another Tree Query Problem(问题模型转化+线段树离线处理)
    最小费用最大流模板
    2020 CCPC Wannafly Winter Camp Day3.C. 无向图定向(k染色问题)
    2020牛客寒假算法基础集训营3.E.牛牛的随机数(数位dp拆位算贡献)
  • 原文地址:https://www.cnblogs.com/mytzq/p/12828245.html
Copyright © 2011-2022 走看看