zoukankan      html  css  js  c++  java
  • 爬取链家二手房

    1. 利用正则的方式爬取数据,并将爬取出来的数据存储到 csv 文件以及 mysql、mongo 数据库中。

    import csv
    import warnings
    
    import pymongo
    import pymysql
    import requests
    import re
    
    
    class LIANJIA:
        """Scrape second-hand housing listings from lianjia.com with a regex.

        Every listing on a result page is captured as a tuple of three raw
        strings (region name, total price, third ``<span>`` text) and can be
        stored in a CSV file, MySQL or MongoDB through the ``saveTo*``
        methods. ``parse`` stores to MongoDB.
        """

        # One match per listing; the three groups are, in order, the text of
        # the data-el="region" link, the first <span> inside "totalPrice" and
        # the next <span> that is directly followed by </div>.
        _LISTING_RE = re.compile(
            '<div class="info clear">.*?data-el="region">(.*?)</a>.*?class="totalPrice"><span>(.*?)</span>.*?<span>(.*?)</span></div>',
            re.S,
        )

        # Seconds to wait for the site before a page request is abandoned, so
        # a stalled connection cannot hang the crawl forever.
        _TIMEOUT = 10

        def __init__(self):
            """Set up request settings and connect to the local MongoDB."""
            self.url = "https://%s.lianjia.com/ershoufang/"
            # NOTE(review): requests selects a proxy by the lowercase URL
            # scheme ("http"/"https"), so this "HTTP" entry is most likely
            # never applied to the https URLs above - confirm before relying
            # on the proxy.
            self.proxies = {"HTTP": "http://116.255.162.107:16816"}
            self.headers = {"user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3100.0 Safari/537.36"}

            # MySQL is only needed by saveTomysql, so the connection is opened
            # lazily there. It is kept separate from self.db, which is the
            # MongoDB database handle.
            self.mysql_db = None
            self.cursor = None

            # Connect to MongoDB: database "lianjia", collection "lianjiafang".
            self.conn = pymongo.MongoClient("localhost", 27017)
            self.db = self.conn["lianjia"]
            self.tab = self.db.lianjiafang

        def getHtml(self, url):
            """Download ``url``, decode it as UTF-8 and hand the page to ``parse``.

            Network errors raised by ``requests`` (including a timeout after
            ``_TIMEOUT`` seconds) propagate to the caller.
            """
            response = requests.get(
                url,
                proxies=self.proxies,
                headers=self.headers,
                timeout=self._TIMEOUT,
            )
            response.encoding = "utf-8"
            self.parse(response.text)

        @classmethod
        def _extract(cls, html):
            """Return the list of 3-string tuples matched by the listing regex."""
            return cls._LISTING_RE.findall(html)

        @staticmethod
        def _clean_row(result_tuple):
            """Normalise one regex match into ``(name, price, sq_mPrice)``.

            The strings are stripped and the total price is multiplied by
            10000 and returned as an ``int``. The price is parsed via
            ``float`` so that decimal prices such as ``"325.5"`` do not raise.

            Raises:
                ValueError: if the price text is not a number.
            """
            name = result_tuple[0].strip()
            price = round(float(result_tuple[1].strip()) * 10000)
            sq_mPrice = result_tuple[2].strip()
            return name, price, sq_mPrice

        # Match the data with the regular expression.
        def parse(self, html):
            """Extract all listings from ``html``, print them and store them in MongoDB."""
            # findall returns a list of tuples, one tuple per listing.
            result_list = self._extract(html)
            print(result_list)
            self.saveTomongo(result_list)

        # Store into CSV.
        def saveTocsv(self, result_list):
            """Print every raw tuple and append it as a row to ``lianjia.csv``."""
            if not result_list:
                # Nothing to write; do not create an empty file.
                return
            # Open the file once for the whole batch instead of once per row.
            with open("lianjia.csv", "a", newline="") as f:
                writer = csv.writer(f)
                for result in result_list:
                    print(result)
                    writer.writerow(result)

        def _mysql_cursor(self):
            """Open the local MySQL connection on first use and return its cursor."""
            if self.cursor is None:
                self.mysql_db = pymysql.connect(
                    host="localhost",
                    user="root",
                    password="123456",
                    charset="utf8",
                )
                self.cursor = self.mysql_db.cursor()
            return self.cursor

        def _close_mysql(self):
            """Close the MySQL cursor and connection if they were opened."""
            if self.cursor is not None:
                self.cursor.close()
                self.cursor = None
            if self.mysql_db is not None:
                self.mysql_db.close()
                self.mysql_db = None

        # Store the data into the MySQL database.
        def saveTomysql(self, result_list):
            """Insert the cleaned listings into the MySQL table ``lianjia.lianjiafang``.

            The database and table are created when missing. A failure while
            creating them is reported and the insert is still attempted.
            All rows are committed together.
            """
            cd_db = "create database if not exists lianjia charset utf8"
            u_db = "use lianjia"
            c_tab = """create table if not exists lianjiafang(
                                id int primary key auto_increment,
                                name varchar(100),
                                price varchar(100),
                                sq_mPrice varchar(100)
                                )"""
            ins = """insert into lianjiafang(name, price, sq_mPrice)
                            values(%s, %s, %s)"""
            # "if not exists" statements emit warnings when the object exists.
            warnings.filterwarnings("ignore")
            cursor = self._mysql_cursor()
            try:
                for statement in (cd_db, u_db, c_tab):
                    cursor.execute(statement)
            except pymysql.MySQLError as exc:
                # Best effort, as before, but no longer silent.
                print("MySQL setup failed:", exc)
            # Insert the records; parameters are passed separately from the SQL.
            rows = [self._clean_row(result_tuple) for result_tuple in result_list]
            cursor.executemany(ins, rows)
            self.mysql_db.commit()
            print("OK")

        # Store into MongoDB.
        def saveTomongo(self, result_list):
            """Insert the cleaned listings into the MongoDB collection.

            Each document has the keys ``name``, ``price`` and ``sq_mPrice``,
            matching the column names of the MySQL table.
            """
            documents = [
                dict(zip(("name", "price", "sq_mPrice"), self._clean_row(result_tuple)))
                for result_tuple in result_list
            ]
            # insert_many rejects an empty list, so skip the call in that case.
            if documents:
                self.tab.insert_many(documents)
            print("OK")

        def workOn(self):
            """Ask for a city prefix and a page count, then crawl those pages."""
            city = input("请输入你要搜索的城市首拼音:")
            end = int(input("爬取多少页:"))
            first_page = self.url % city
            try:
                for x in range(1, end + 1):
                    # Page 1 is the bare listing URL; later pages add "pgN/".
                    if x == 1:
                        url = first_page
                    else:
                        url = first_page + "pg" + str(x) + "/"
                    self.getHtml(url)
            finally:
                # Release the MySQL connection if any step opened one.
                self._close_mysql()
    
    
    if __name__ == "__main__":
        # Script entry point: build the scraper and start the interactive crawl.
        scraper = LIANJIA()
        scraper.workOn()

    得到csv中的结果,mysql和mongo结果就不显示了:

  • 相关阅读:
    企业IT管理员IE11升级指南【7】—— Win7和Win8.1上的IE11功能对比
    Microsoft Azure Web Sites应用与实践【3】—— 通过Visual Studio Online在线编辑Microsoft Azure 网站
    Microsoft Azure Web Sites应用与实践【2】—— 通过本地IIS 远程管理Microsoft Azure Web Site
    万以内的数字转繁体
    你的生产力工具集
    nodejs保存图片至本地
    设置input 中placeholder的样式
    微信支付报调用支付JSAPI缺少参数: sign
    vuepress项目 配置/使用/部署 markdown语法
    移动端自适应js
  • 原文地址:https://www.cnblogs.com/zengsf/p/10022159.html
Copyright © 2011-2022 走看看