zoukankan      html  css  js  c++  java
  • 寻医问药 爬虫

    import requests
    import re
    import pandas as pd
    
    def get_all_date_url():
        """Collect the dated listing URLs linked from the numbered index pages.

        Requests index pages 1 through 61 under ``http://club.xywy.com/keshi/``
        and extracts every absolute link whose path looks like
        ``/keshi/<4 digits>-<2 digits>-<digits>/<digits>.html``.

        Returns:
            list: The matching URLs with duplicates removed, in no particular
            order.
        """
        # The digit classes must be written with a backslash; a bare "d" only
        # matches the literal letter, so the pattern never matched a real URL.
        # The dots are escaped so they match "." and nothing else.
        date_url_pattern = re.compile(
            r"http://club\.xywy\.com/keshi/\d{4}-\d{2}-\d+/\d+\.html"
        )
        found = set()
        for page_no in range(1, 62):
            index_url = 'http://club.xywy.com/keshi/{}.html'.format(page_no)
            # A timeout keeps one stalled response from hanging the whole crawl.
            res = requests.get(index_url, timeout=10)
            found.update(date_url_pattern.findall(res.text))
        return list(found)
    
    def get_QA_url(url):
        """Expand one dated listing URL into the URLs of all of its pages.

        Downloads ``url``, decodes it as gb2312, reads the total page count from
        the "共 N 页" text and builds one URL per page, reusing the
        second-to-last path segment of ``url`` as the directory name.

        Args:
            url: A listing URL such as those returned by ``get_all_date_url``.

        Returns:
            list: Page URLs numbered from 1 to the page count, in page order.
        """
        # A timeout keeps one stalled response from hanging the whole crawl.
        res = requests.get(url, timeout=10)
        res.encoding = 'gb2312'
        # "\d+" (with the backslash) is required to capture the number; a bare
        # "d+" matched nothing and made the [0] lookup raise IndexError.
        page_counts = re.findall(r'共 (\d+) 页', res.text)
        # NOTE(review): when no pager text is present we assume a single page
        # instead of crashing -- confirm against a real one-page listing.
        total_pages = int(page_counts[0]) if page_counts else 1
        section = url.split('/')[-2]
        # Each page number is distinct, so no de-duplication pass is needed.
        return [
            'http://club.xywy.com/keshi/' + section + '/' + str(page_no) + '.html'
            for page_no in range(1, total_pages + 1)
        ]
    
    def main():
        """Crawl the listing pages and write one spreadsheet row per page URL.

        Gathers every page URL via ``get_all_date_url`` and ``get_QA_url``,
        builds one record per URL and saves the records to
        ``xunyiwenyao.xlsx`` without the index column.
        """
        all_url_data = []
        for date_url in get_all_date_url():
            all_url_data.extend(get_QA_url(date_url))

        # Columns that are meant to be parsed from each page.  The original
        # assigned all of them from an undefined name, which raised NameError on
        # the first row.
        # TODO: implement the actual extraction; until then the cells are empty.
        detail_fields = (
            '患者标题',
            '患者姓名',
            '患者性别',
            '提问日期',
            '患者描述',
            '医生姓名',
            '医生职称',
            '医生科室',
            '问题分析',
            '回答时间',
        )

        info_list = []
        for detail_url in all_url_data:
            final_dic_data = {'url': detail_url}
            for field in detail_fields:
                final_dic_data[field] = None
            info_list.append(final_dic_data)

        df = pd.DataFrame(info_list)
        df.to_excel('xunyiwenyao.xlsx', index=False)
    
    # Run the crawl only when executed as a script, not when imported.
    if __name__ == '__main__':
        main()
  • 相关阅读:
    001_jdk配置
    mysql(5.7)安装教程
    mysql(5.6)安装教程
    外网发布
    蓝桥 历届试题 分考场
    蓝桥 历届试题 合根植物
    Codeforces Round #650 (Div. 3) D : Task On The Board
    HDU 3336 Count the string
    leetcode [238. 除自身以外数组的乘积]
    leetcode [837. 新21点]
  • 原文地址:https://www.cnblogs.com/Erick-L/p/8311825.html
Copyright © 2011-2022 走看看