zoukankan      html  css  js  c++  java
  • python爬虫数据追加至excel中

    import requests
    from lxml import etree
    from fake_useragent import UserAgent
    import pandas as pd
    from openpyxl import load_workbook
    import time
    
    
    # Request configuration shared by every requests.get() call in this script.
    ua = UserAgent()
    # Cookies sent with each request.
    # NOTE(review): these look like a captured logged-in browser session
    # (PHPSESSID, *_auth, user id/name) committed in source; presumably they
    # expire and must be refreshed by hand -- consider loading them from the
    # environment or a local, untracked file instead.
    cookies = {
        'kztoken': 'nJail6zJp6iXaJqWmGtnYGVvYZeU',
        'his': 'a%3A10%3A%7Bi%3A0%3Bs%3A28%3A%22nJail6zJp6iXaJqWmGtnYGVsZpqZ%22%3Bi%3A1%3Bs%3A28%3A%22nJail6zJp6iXaJqWmGtnYGVsZpuW%22%3Bi%3A2%3Bs%3A28%3A%22nJail6zJp6iXaJqWmGtnYGVsZpyV%22%3Bi%3A3%3Bs%3A28%3A%22nJail6zJp6iXaJqWmGtnYGVsZ5OU%22%3Bi%3A4%3Bs%3A28%3A%22nJail6zJp6iXaJqWmGtnYGVsZ5SZ%22%3Bi%3A5%3Bs%3A28%3A%22nJail6zJp6iXaJqWmGtnYGVsZ5qZ%22%3Bi%3A6%3Bs%3A28%3A%22nJail6zJp6iXaJqWmGtnYGVuZ5WU%22%3Bi%3A7%3Bs%3A28%3A%22nJail6zJp6iXaJqWmGtnYGVuapqa%22%3Bi%3A8%3Bs%3A28%3A%22nJail6zJp6iXaJqWmGtnYGVvYZSb%22%3Bi%3A9%3Bs%3A28%3A%22nJail6zJp6iXaJqWmGtnYGVvYZeU%22%3B%7D',
        'hmap_show': 'true',
        '_ga': 'GA1.2.1239476714.1625041333',
        '_gid': 'GA1.2.1919533066.1625041333',
        'yaozh_userId': '1106225',
        'yaozh_uidhas': '1',
        'yaozh_mylogin': '1625043247',
        'UtzD_f52b_saltkey': 'A5nDU2YE',
        'UtzD_f52b_lastvisit': '1625040594',
        'yaozh_logintime': '1625044573',
        'yaozh_user': '1106225%09shangzx',
        'yaozh_jobstatus': 'kptta67UcJieW6zKnFSe2JyYnoaSaJVrl5aag26qb21rg66flM6bh5%2BscZhyVNbNw8%2FL3tlZoKifnZ%2BDn5iorJDVop6Yg3HYnmpnm1pjmZeD9Ca554583f9d1b099c0F042261C4F78XkZiXk2uVV6DXn5VtWamhnsZbbKabZ5ieW2iXameTmZaWm5iXZ55XoOE%3D208d011709412dc4d888156e6460cc04',
        'db_w_auth': '900655%09shangzx',
        'UtzD_f52b_lastact': '1625044575%09uc.php%09',
        'UtzD_f52b_auth': '0d2eFXxmxST84XhMU0dko6H9kPGWqhw1XkgTI91OTE5V62q41qdQayOAS7NuWVAcWm2eknIFbZNUWXPNk%2B0eM3KVq%2F8',
        'PHPSESSID': 'e1o7i5tm8pr5uo4uouphr5pe16',
        'Hm_lvt_65968db3ac154c3089d7f9a4cbb98c94': '1625041333,1625043148,1625044194,1625100919',
        'Hm_lpvt_65968db3ac154c3089d7f9a4cbb98c94': '1625100942',
        'acw_tc': '2f624a1616251051582993999e0f5553f53c4c3b47e273c6712cf34ea49775',
    }
    
    # Browser-like request headers. ua.random is evaluated once, when this dict
    # is built, so every request in the script reuses the same User-Agent value.
    headers = {
        'Connection': 'keep-alive',
        'Pragma': 'no-cache',
        'Cache-Control': 'no-cache',
        'sec-ch-ua': '" Not;A Brand";v="99", "Google Chrome";v="91", "Chromium";v="91"',
        'sec-ch-ua-mobile': '?0',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': ua.random,
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
        'Sec-Fetch-Site': 'same-origin',
        'Sec-Fetch-Mode': 'navigate',
        'Sec-Fetch-User': '?1',
        'Sec-Fetch-Dest': 'document',
        'Referer': 'https://db.yaozh.com/hmap?grade=%E5%85%A8%E9%83%A8&p=19&pageSize=20&province=%E6%B2%B3%E5%8D%97&type=%E5%85%A8%E9%83%A8',
        'Accept-Language': 'zh-CN,zh;q=0.9',
    }
    
    # Query string for the hospital list page. 'p' is fixed at '1', so only the
    # first page (up to 'pageSize' entries) of the filtered list is requested.
    params = (
        ('grade', '三级甲等'),
        ('p', '1'),
        ('pageSize', '30'),
        ('province', '河南'),
        ('type', '中医医院'),
    )
    
    # Fetch the hospital list page selected by `params`, visit every hospital's
    # detail page, and append one row per hospital to Sheet1 of hosp_list.xlsx.
    # The workbook and its 'Sheet1' must already exist.
    # A timeout is set so a stalled connection cannot hang the script forever.
    response = requests.get('https://db.yaozh.com/hmap', headers=headers, params=params, cookies=cookies, timeout=30)

    html = etree.HTML(response.text)
    # Relative links to the detail pages; the class value really does end with a space.
    detail_hrefs = html.xpath('//a[@class="cl-blue "]/@href')

    # Spreadsheet column order; the detail page's row labels are matched against it.
    title_list = ['省','市','县','医院名称','医院别名','医院等级','医院类型','负责人','经营方式','床位数','医院科室','电话','医院地址']
    column_of = {title: position for position, title in enumerate(title_list)}
    row_xpath = '//div[@class="table-wrapper"]/table/tbody/tr[{}]'
    excel_path = 'hosp_list.xlsx'

    for detail_href in detail_hrefs:
        detail_url = 'https://db.yaozh.com' + detail_href
        time.sleep(3)  # throttle: one detail request every three seconds
        code_response = requests.get(detail_url, headers=headers, cookies=cookies, timeout=30)
        code_html = etree.HTML(code_response.text)
        if code_html is None:
            # NOTE(review): lxml appears to return None for an empty document.
            # The old bare `except` hid that case and wrote an all-blank row.
            print('skipped, empty page:', detail_url)
            continue

        hosp_list = [''] * len(title_list)
        n = 0
        while True:
            n += 1
            row = row_xpath.format(n)
            labels = code_html.xpath(row + '/th/text()')
            if not labels:
                # First row without a header cell: the table has been read completely.
                break
            hosp_name_1 = labels[0].replace(' ', '')

            values = code_html.xpath(row + '/td/span/text()')
            if len(values) < 2:
                # The value is taken from the second text node; rows without one are skipped.
                continue
            hosp_name_2 = values[1].replace(' ', '').replace('\n', '')
            print(hosp_name_1, hosp_name_2)

            # Labels that are not spreadsheet columns are printed but not stored.
            if hosp_name_1 in column_of:
                hosp_list[column_of[hosp_name_1]] = hosp_name_2

        hosp_tup = tuple(hosp_list)
        print(hosp_tup)

        # Append below the last used row and save after every hospital, so an
        # interrupted run keeps the rows already collected. openpyxl is used
        # directly because assigning `ExcelWriter.book` and calling
        # `ExcelWriter.save()` no longer work on current pandas.
        book = load_workbook(excel_path)
        book['Sheet1'].append(hosp_list)
        book.save(excel_path)
  • 相关阅读:
    Scala中隐式转换(implicit conversion)的优先顺序
    protege4.0使用中的理论
    国外程序员整理的 C++ 资源大全
    什么是本体论?
    深入分析C++引用
    在基于对话框的MFC程序中,使程序在任务栏中不显示图标
    PhoneGap搭建运行环境(3.2版本)
    [JS代码]如何判断ipad或者iphone是否为横屏或者竖屏
    Windows IIS 7.5 使用html 报405错误
    NodeJs 开源
  • 原文地址:https://www.cnblogs.com/bear-king/p/14961911.html
Copyright © 2011-2022 走看看