zoukankan      html  css  js  c++  java
  • 顺企网 爬取16W数据保存到MongoDB

    import requests
    from bs4 import BeautifulSoup
    import pymongo
    from multiprocessing.dummy import Pool as ThreadPool
    
    # Request headers sent with every page fetch (a mobile Safari User-Agent).
    headers = {
        'User-Agent': 'Mozilla/5.0 (iPhone; CPU iPhone OS 9_1 like Mac OS X) AppleWebKit/601.1.46 (KHTML, like Gecko) Version/9.0 Mobile/13B143 Safari/601.1',
    }

    # Database setup: connect to the MongoDB server on localhost:27017.
    client = pymongo.MongoClient(host='localhost', port=27017)
    # Database handle; the name 'conpany_info' is kept exactly as stored.
    conpany_info = client.get_database('conpany_info')
    # Collection that receives one document per scraped page.
    sheet_table = conpany_info.get_collection('sheet_table')
    
    def jiexi(url):
        """Fetch one page and parse its ``.codl`` detail list into a dict.

        Sends a GET request for ``url`` using the module-level ``headers``,
        then pairs each ``.codl dt`` element with the ``.codl dd`` element at
        the same position. If the two lists differ in length, the surplus
        elements are dropped by ``zip``.

        Args:
            url: Address of the page to download.

        Returns:
            A dict mapping the whitespace-stripped ``dt`` text to the ``dd``
            text, or ``None`` when the server answers with HTTP 404. Any
            other status code is still parsed, which may yield an empty dict.

        Raises:
            requests.RequestException: If the request fails or times out.
        """
        # Without a timeout a stalled connection would block its worker
        # thread indefinitely.
        res = requests.get(url, headers=headers, timeout=30)
        if res.status_code == 404:
            return None
        soup = BeautifulSoup(res.text, 'lxml')
        values = [dd.text for dd in soup.select('.codl dd')]
        labels = [dt.text for dt in soup.select('.codl dt')]
        # strip() with no argument trims surrounding whitespace; the previous
        # strip('') removed nothing and left padded keys in the result.
        return {label.strip(): value for label, value in zip(labels, values)}
    
    # Lazy stream of page URLs for the numeric ids 2 through 160997 inclusive.
    urls = map('https://m.11467.com/jinan/co/{}.htm'.format, range(2, 160998))
    
    def get_all_data(url):
        """Scrape one page and store the parsed record in ``sheet_table``.

        Calls ``jiexi(url)``; when it returns a non-empty dict, the dict is
        inserted as one document and the collection's current document count
        is printed. A ``None`` or empty result is skipped silently.

        Args:
            url: Address of the page to scrape.

        Returns:
            None. Any exception raised while fetching, parsing or inserting
            is caught and printed together with ``url``, so one bad page
            does not abort the remaining work.
        """
        try:
            result = jiexi(url)
            if result:
                # insert_one() and count_documents() replace the deprecated
                # Collection.insert() and Cursor.count(), which were removed
                # in PyMongo 4.
                sheet_table.insert_one(result)
                print('获取了 ' + str(sheet_table.count_documents({})) + '条数据')
        except Exception as e:
            # Deliberate best-effort boundary: report the failure and move on.
            print(e, url)
    if __name__ == "__main__":
        # Script entry point: hand every URL to a pool of four worker threads.
        worker_pool = ThreadPool(processes=4)
        # map() blocks until get_all_data has been called for every URL.
        results = worker_pool.map(get_all_data, urls)
        # Stop accepting new work, then wait for the workers to exit.
        worker_pool.close()
        worker_pool.join()
  • 相关阅读:
    shell
    RANDOM随机数
    docker网络管理
    Oracle-28001密码过期问题及28000账户被锁解决
    Oracle数据泵导入导出(expdp/impdp)
    mysql多实例部署
    sed命令基本使用
    MySQL5.7.x二进制安装
    每日日报
    每日日报
  • 原文地址:https://www.cnblogs.com/Erick-L/p/7028032.html
Copyright © 2011-2022 走看看