import requests
from lxml import etree
from fake_useragent import UserAgent
import pandas as pd
from openpyxl import load_workbook
import time
# Source of User-Agent strings for the request headers below.
ua = UserAgent()

# Cookies sent with every request to db.yaozh.com.
# NOTE(review): these look like the cookies of a logged-in browser session
# committed to source; consider loading them from the environment or a
# git-ignored file instead -- confirm with the owner.
cookies = {
    'kztoken': 'nJail6zJp6iXaJqWmGtnYGVvYZeU',
    'his': 'a%3A10%3A%7Bi%3A0%3Bs%3A28%3A%22nJail6zJp6iXaJqWmGtnYGVsZpqZ%22%3Bi%3A1%3Bs%3A28%3A%22nJail6zJp6iXaJqWmGtnYGVsZpuW%22%3Bi%3A2%3Bs%3A28%3A%22nJail6zJp6iXaJqWmGtnYGVsZpyV%22%3Bi%3A3%3Bs%3A28%3A%22nJail6zJp6iXaJqWmGtnYGVsZ5OU%22%3Bi%3A4%3Bs%3A28%3A%22nJail6zJp6iXaJqWmGtnYGVsZ5SZ%22%3Bi%3A5%3Bs%3A28%3A%22nJail6zJp6iXaJqWmGtnYGVsZ5qZ%22%3Bi%3A6%3Bs%3A28%3A%22nJail6zJp6iXaJqWmGtnYGVuZ5WU%22%3Bi%3A7%3Bs%3A28%3A%22nJail6zJp6iXaJqWmGtnYGVuapqa%22%3Bi%3A8%3Bs%3A28%3A%22nJail6zJp6iXaJqWmGtnYGVvYZSb%22%3Bi%3A9%3Bs%3A28%3A%22nJail6zJp6iXaJqWmGtnYGVvYZeU%22%3B%7D',
    'hmap_show': 'true',
    '_ga': 'GA1.2.1239476714.1625041333',
    '_gid': 'GA1.2.1919533066.1625041333',
    'yaozh_userId': '1106225',
    'yaozh_uidhas': '1',
    'yaozh_mylogin': '1625043247',
    'UtzD_f52b_saltkey': 'A5nDU2YE',
    'UtzD_f52b_lastvisit': '1625040594',
    'yaozh_logintime': '1625044573',
    'yaozh_user': '1106225%09shangzx',
    'yaozh_jobstatus': 'kptta67UcJieW6zKnFSe2JyYnoaSaJVrl5aag26qb21rg66flM6bh5%2BscZhyVNbNw8%2FL3tlZoKifnZ%2BDn5iorJDVop6Yg3HYnmpnm1pjmZeD9Ca554583f9d1b099c0F042261C4F78XkZiXk2uVV6DXn5VtWamhnsZbbKabZ5ieW2iXameTmZaWm5iXZ55XoOE%3D208d011709412dc4d888156e6460cc04',
    'db_w_auth': '900655%09shangzx',
    'UtzD_f52b_lastact': '1625044575%09uc.php%09',
    'UtzD_f52b_auth': '0d2eFXxmxST84XhMU0dko6H9kPGWqhw1XkgTI91OTE5V62q41qdQayOAS7NuWVAcWm2eknIFbZNUWXPNk%2B0eM3KVq%2F8',
    'PHPSESSID': 'e1o7i5tm8pr5uo4uouphr5pe16',
    'Hm_lvt_65968db3ac154c3089d7f9a4cbb98c94': '1625041333,1625043148,1625044194,1625100919',
    'Hm_lpvt_65968db3ac154c3089d7f9a4cbb98c94': '1625100942',
    'acw_tc': '2f624a1616251051582993999e0f5553f53c4c3b47e273c6712cf34ea49775',
}

# Browser-like request headers sent with every request.
headers = {
    'Connection': 'keep-alive',
    'Pragma': 'no-cache',
    'Cache-Control': 'no-cache',
    'sec-ch-ua': '" Not;A Brand";v="99", "Google Chrome";v="91", "Chromium";v="91"',
    'sec-ch-ua-mobile': '?0',
    'Upgrade-Insecure-Requests': '1',
    # Evaluated once, here: the same User-Agent is reused for every request
    # made with this dict.
    'User-Agent': ua.random,
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
    'Sec-Fetch-Site': 'same-origin',
    'Sec-Fetch-Mode': 'navigate',
    'Sec-Fetch-User': '?1',
    'Sec-Fetch-Dest': 'document',
    'Referer': 'https://db.yaozh.com/hmap?grade=%E5%85%A8%E9%83%A8&p=19&pageSize=20&province=%E6%B2%B3%E5%8D%97&type=%E5%85%A8%E9%83%A8',
    'Accept-Language': 'zh-CN,zh;q=0.9',
}

# Query string for the hospital listing request: grade "三级甲等"
# (tertiary, class A), province "河南" (Henan), type "中医医院"
# (traditional Chinese medicine hospital), page 1 with a page size of 30.
params = (
    ('grade', '三级甲等'),
    ('p', '1'),
    ('pageSize', '30'),
    ('province', '河南'),
    ('type', '中医医院'),
)
# Seconds to wait on the server before giving up, so that a stalled
# connection cannot hang the whole run.
REQUEST_TIMEOUT = 30

# Pause before each detail-page request, in seconds.
REQUEST_DELAY = 3

# Workbook (and sheet inside it) that collected rows are appended to.
# The file and the sheet must already exist.
WORKBOOK_PATH = 'hosp_list.xlsx'
SHEET_NAME = 'Sheet1'

# Row labels of the detail-page table, in spreadsheet column order.
title_list = ['省','市','县','医院名称','医院别名','医院等级','医院类型','负责人','经营方式','床位数','医院科室','电话','医院地址']

# XPath templates for the label / value cells of the n-th table row.
ROW_LABEL_XPATH = '//div[@class="table-wrapper"]/table/tbody/tr[{}]/th/text()'
ROW_VALUE_XPATH = '//div[@class="table-wrapper"]/table/tbody/tr[{}]/td/span/text()'


def _parse_hospital_fields(code_html):
    """Extract one hospital's fields from a parsed detail page.

    Walks the rows of the ``table-wrapper`` table from the first row on and
    stops at the first row index that yields no label text.

    Args:
        code_html: lxml element tree of the detail page, or None.

    Returns:
        A list with one string per entry of ``title_list`` (same order);
        fields that were not found on the page stay ``''``.
    """
    fields = [''] * len(title_list)
    if code_html is None:
        # NOTE(review): guards against the parser having produced no tree.
        # The original bare ``except`` silently treated that as "no rows".
        return fields

    row_number = 0
    while True:
        row_number += 1
        try:
            label = code_html.xpath(ROW_LABEL_XPATH.format(row_number))[0].replace(' ', '')
        except IndexError:
            # No label text for this row index: end of the table.
            break

        try:
            # The value is taken from the second text node of the cell's spans.
            value = code_html.xpath(ROW_VALUE_XPATH.format(row_number))[1].replace(' ', '').replace('\n', '')
        except IndexError:
            # Row has no usable value; leave the field blank.
            continue

        print(label, value)
        # Labels that are not exported columns are ignored.
        if label in title_list:
            fields[title_list.index(label)] = value
    return fields


def _append_row(row):
    """Append ``row`` below the last row of the sheet and save the workbook.

    Uses openpyxl directly; assigning ``ExcelWriter.book`` / ``.sheets`` and
    calling ``ExcelWriter.save()`` is not supported by current pandas.
    Saving after every row keeps already-scraped rows if a later request
    fails.
    """
    book = load_workbook(WORKBOOK_PATH)
    book[SHEET_NAME].append(row)
    book.save(WORKBOOK_PATH)


# Fetch the listing page and collect the relative links to the detail pages.
response = requests.get('https://db.yaozh.com/hmap', headers=headers, params=params, cookies=cookies, timeout=REQUEST_TIMEOUT)
html = etree.HTML(response.text)
name = html.xpath('//a[@class="cl-blue "]/@href')

for hopstial_name_hrefs in name:
    hopstial_name_href = 'https://db.yaozh.com' + hopstial_name_hrefs
    time.sleep(REQUEST_DELAY)
    code_response = requests.get(hopstial_name_href, headers=headers, cookies=cookies, timeout=REQUEST_TIMEOUT)
    code_html = etree.HTML(code_response.text)

    hosp_list = _parse_hospital_fields(code_html)
    print(tuple(hosp_list))
    _append_row(hosp_list)