zoukankan      html  css  js  c++  java
  • 数据分析案例之39药品网

    前期准备

    获取39药品网所有药品ID

        1.下载39药品网所有药品页面

    # -*- coding: utf-8 -*-
    
    """
    @Datetime: 2018/10/15
    @Author: Zhang Yafei
    """
    import re
    import logging
    import requests
    import os
    import time
    from retrying import retry
    from urllib.request import urljoin
    from urllib.parse import urlsplit
    # from scrapy import Selector
    from lxml import etree
    # from fake_useragent import UserAgent
    from multiprocessing import Pool
    from ids import Diabetes_ids
    
    # ua = UserAgent()
    # headers = {'User-Agent':ua.random}
    # Headers sent with every HTTP request: a fixed desktop Chrome User-Agent.
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36'}
    # Proxy servers passed to requests by Drug.retry_download.
    proxies = {'http':'http://61.135.217.7:80','https':'http://171.113.156.168:8010'}

    # Directory containing this script; every path below is built from it.
    BASE_DIR = os.path.dirname(os.path.abspath(__file__))
    # DOWNLOAD_DIR = os.path.join(BASE_DIR,'药品')
    DOWNLOAD_DIR = os.path.join(BASE_DIR,'糖尿病')
    # Text file read by get_drug_urls(): one drug overview URL per line.
    file_path = os.path.join(BASE_DIR,'drug_ruls.txt')

    RUN_LOG_FILE = os.path.join(BASE_DIR,'log','run.log')
    ERROR_LOG_FILE = os.path.join(BASE_DIR,'log','error_log')


    # exist_ok avoids the check-then-create race when several worker
    # processes import this module at the same time.
    os.makedirs(DOWNLOAD_DIR, exist_ok=True)
    
    
    class Logger(object):
        """
        Pair of loggers: one for run (INFO) messages, one for error messages.

        Each logger appends to its own file (RUN_LOG_FILE / ERROR_LOG_FILE)
        and also echoes through a logging.StreamHandler.  No formatter is
        installed, so only the bare message text is written.
        """
        def __init__(self):
            self.run_log_file = RUN_LOG_FILE
            self.error_log_file = ERROR_LOG_FILE
            # Kept for backward compatibility; nothing in this class reads them.
            self.run_log = None
            self.error_log = None

            self.initialize_run_log()
            self.initialize_error_log()

        @staticmethod
        def check_path_exist(log_abs_file):
            """
            Make sure the directory that will hold log_abs_file exists.

            :param log_abs_file: path of the log file
            :return:
            """
            log_path = os.path.split(log_abs_file)[0]
            # A bare file name has no directory part; nothing to create then.
            if log_path:
                # makedirs handles nested paths and an already existing directory.
                os.makedirs(log_path, exist_ok=True)

        def _build_logger(self, name, level, log_file):
            """Create a logger that appends to log_file and echoes to a stream."""
            self.check_path_exist(log_file)
            built = logging.Logger(name, level=level)
            built.addHandler(logging.FileHandler(log_file, 'a', encoding='utf-8'))
            built.addHandler(logging.StreamHandler())
            return built

        def initialize_run_log(self):
            """Create the INFO-level run logger and store it as self.run_logger."""
            self.run_logger = self._build_logger('run_log', logging.INFO, self.run_log_file)

        def initialize_error_log(self):
            """Create the ERROR-level logger and store it as self.error_logger."""
            self.error_logger = self._build_logger('error_log', logging.ERROR, self.error_log_file)

        def log(self, message, mode=True):
            """
            Write one log record.

            :param message: text to log
            :param mode: True logs to the run log, False logs to the error log
            :return:
            """
            if mode:
                self.run_logger.info(message)
            else:
                self.error_logger.error(message)
    
    # Module-level logger instance used by Drug and spider below.
    logger = Logger()
    
    class Drug(object):
        """
        Download all pages of one drug from ypk.39.net into its own folder.

        Constructing an instance performs the whole crawl for that drug:
        overview, detailed manual, every medication-experience (comment) page
        and every consultation (ask) page.

        self.base_url       overview page URL; the other URLs are derived from it
        self.drug_id        second-to-last "/"-separated segment of base_url
        self.manual_url     detailed manual URL
        self.comment_url    medication-experience URL
        self.ask_url        consultation URL
        self.drug_name      page title made safe for use in file names
        self.drug_dir_path  folder the pages of this drug are written to
        """
        # Characters that are replaced in file names:  / \ : * ? " < > |
        _INVALID_TITLE_CHARS = r'[/\\:*?"<>|]'
        # Seconds a direct (non-proxied) request may take before it is abandoned.
        _REQUEST_TIMEOUT = 10

        def __init__(self,base_url):
            self.base_url = base_url
            # base_url is expected to end with "/", e.g. http://ypk.39.net/<id>/
            self.drug_id = self.base_url.split('/')[-2]
            self.manual_url = urljoin(base_url,'manual')
            self.comment_url = urljoin(base_url,'comment')
            self.ask_url = urljoin(base_url,'ask')

            self.make_drug_dir()
            for step in (self.summary,self.manual,self.comment,self.ask):
                step()

        def make_drug_dir(self):
            """
            Read the drug name from the overview page and create the drug folder.

            Raises IndexError when no title can be found on the page.
            :return:
            """
            response = requests.get(self.base_url,headers=headers,timeout=self._REQUEST_TIMEOUT)
            response.encoding = response.apparent_encoding
            html = etree.HTML(response.text)
            # The title is either a link inside <h1> or plain <h1> text.
            names = (html.xpath('//div[@class="t1"]/h1/a/text()')
                     or html.xpath('//div[@class="t1"]/h1/text()'))
            self.drug_name = self.validateTitle(names[0])
            self.drug_dir_path = os.path.join(DOWNLOAD_DIR,'{}[{}]'.format(self.drug_name,self.drug_id))
            # exist_ok: several worker processes may create folders concurrently.
            os.makedirs(self.drug_dir_path, exist_ok=True)

        def validateTitle(self,title):
            """Return title with every character unusable in a file name replaced by '_'."""
            return re.sub(self._INVALID_TITLE_CHARS, "_", title)

        @staticmethod
        def _page_count(total, per_page):
            """Number of pages needed for total items; always at least one."""
            return max(1, -(-total // per_page))

        @staticmethod
        def _page_label(url, page=None):
            """Page number used in file names: page if given, else the digits ending url."""
            if page is not None:
                return str(page)
            match = re.search(r'(\d+)$', url)
            return match.group(1) if match else url[-1]

        @retry(stop_max_attempt_number=3)
        def retry_download(self, url):
            """
            Download url through the configured proxies, trying at most three times.

            :param url: address of the page to download
            :return: the response, whose status code is 200
            """
            result = requests.get(url, headers=headers, proxies=proxies,timeout=3)
            # Raise explicitly (not assert, which -O strips) so @retry tries again.
            if result.status_code != 200:
                raise requests.HTTPError('unexpected status {} for {}'.format(result.status_code, url))
            return result

        def download(self, url):
            """
            Call retry_download and turn a final failure into None.

            :param url: address of the page to download
            :return: the response, or None when every attempt failed
            """
            try:
                result = self.retry_download(url)
            except Exception as e:  # the retry wrapper may raise its own error types
                print(e)
                result = None
            return result

        def _fetch(self, url):
            """
            Return a 200 response for url, or None when it cannot be downloaded.

            A direct request is tried first; if it raises or the status is not
            200, the proxied retrying download of the same url is used.  A final
            failure is written to the error log as the bare url.
            """
            try:
                response = requests.get(url,headers=headers,timeout=self._REQUEST_TIMEOUT)
            except requests.RequestException as e:
                print(e)
                response = None
            if response is not None and response.status_code == 200:
                return response
            response = self.download(url)
            if response is None:
                logger.log('{}'.format(url),False)
            return response

        def _save_page(self, url, path, exists_msg, done_msg):
            """Download url to path unless path already exists; report via the given messages."""
            if os.path.exists(path):
                print(exists_msg)
                return
            response = self._fetch(url)
            if response is None:
                return
            text = response.content.decode('gb2312','ignore')
            with open(path,'w',encoding='gb2312') as file:
                file.write(text)
            logger.log(done_msg)

        def summary(self):
            """
            Download the drug overview page.
            :return:
            """
            self._save_page(
                self.base_url,
                os.path.join(self.drug_dir_path,'{}[{}]-药品概述.html'.format(self.drug_name,self.drug_id)),
                '{}药品概述已经下载过了'.format(self.drug_name),
                '{}[{}]-药品概述下载完成'.format(self.drug_name,self.drug_id),
            )

        def manual(self):
            """
            Download the detailed manual page.
            :return:
            """
            self._save_page(
                self.manual_url,
                os.path.join(self.drug_dir_path,'{}[{}]-详细说明书.html'.format(self.drug_name,self.drug_id)),
                '{}详细说明书已经下载过了'.format(self.drug_name),
                '{}[{}]-详细说明书下载完成'.format(self.drug_name,self.drug_id),
            )

        def comment(self):
            """
            Read the comment count and download every medication-experience page.

            Pages are assumed to hold 20 comments each; page 1 is always fetched.
            :return:
            """
            response = self._fetch(self.comment_url)
            if response is None:
                return
            html = etree.HTML(response.content.decode('gb2312','ignore'))
            try:
                comment_nums = int(html.xpath('//div[@class="dps"]/cite/font/text()')[0])
            except (IndexError, ValueError):
                logger.log('{}[{}]-用药经验页评论数为零'.format(self.drug_name,self.drug_id))
                comment_nums = 0
            for page in range(1, self._page_count(comment_nums, 20) + 1):
                url = urljoin(self.base_url,'comment/k0_p{}'.format(page))
                self.comment_page(url, page)

        def comment_page(self,url,page=None):
            """
            Download one medication-experience page.

            :param url: address of the page
            :param page: page number used in the file name; when omitted it is
                         taken from the digits at the end of url
            :return:
            """
            label = self._page_label(url, page)
            self._save_page(
                url,
                os.path.join(self.drug_dir_path,'{}[{}]-用药经验{}.html'.format(self.drug_name,self.drug_id,label)),
                '{}[{}]-用药经验{}已经下载过了'.format(self.drug_name,self.drug_id,label),
                '{}[{}]-用药经验{}下载完成'.format(self.drug_name,self.drug_id,label),
            )

        def ask(self):
            """
            Read the question count and download every consultation page.

            Pages are assumed to hold 5 questions each; page 1 is always fetched.
            :return:
            """
            response = self._fetch(self.ask_url)
            if response is None:
                return
            html = etree.HTML(response.content.decode('gb2312','ignore'))
            try:
                pager_text = html.xpath('//span[@class="pages"]/span[@class="pgleft"]/b/text()')[0]
                ask_nums = int(re.match(r'.*?(\d+).*',pager_text).group(1))
            except (IndexError, AttributeError, ValueError):
                ask_nums = 0
                logger.log('{}[{}]-用药咨询页无人提问'.format(self.drug_name,self.drug_id))
            for page in range(1, self._page_count(ask_nums, 5) + 1):
                url = urljoin(self.base_url,'ask/p{}'.format(page))
                self.ask_page(url, page)

        def ask_page(self,url,page=None):
            """
            Download one consultation page.

            :param url: address of the page
            :param page: page number used in the file name; when omitted it is
                         taken from the digits at the end of url
            :return:
            """
            label = self._page_label(url, page)
            self._save_page(
                url,
                os.path.join(self.drug_dir_path,'{}[{}]-用药咨询{}.html'.format(self.drug_name,self.drug_id,label)),
                '{}[{}]-用药咨询{}已经下载过了'.format(self.drug_name,self.drug_id,label),
                '{}[{}]-用药咨询{}下载完成'.format(self.drug_name,self.drug_id,label),
            )
    
    
    def transform_urls(filename):
        """
        Turn a drug folder name such as "name[12345]" into its overview URL.

        The id is the last run of digits enclosed in square brackets.
        Raises IndexError when filename contains no such "[digits]" part.
        """
        drug_id = re.findall(r'\[(\d+)\]', filename)[-1]
        drug_url = 'http://ypk.39.net/{}/'.format(drug_id)
        return drug_url
    
    
    def check_downloaded(func):
        """
        Decorator: call func only with the URLs that have no folder in DOWNLOAD_DIR yet.

        Each entry of DOWNLOAD_DIR is mapped back to a URL with transform_urls;
        entries that cannot be mapped are ignored.  The remaining URLs keep
        their input order with duplicates removed, and func's return value is
        passed through.
        """
        import functools

        @functools.wraps(func)
        def inner(drug_urls):
            downloaded = set()
            for filename in os.listdir(DOWNLOAD_DIR):
                try:
                    downloaded.add(transform_urls(filename))
                except IndexError:
                    # Not a "<name>[<id>]" drug folder; it must not abort the run.
                    continue
            # dict.fromkeys removes duplicates while keeping the first-seen order.
            pending = [url for url in dict.fromkeys(drug_urls) if url not in downloaded]
            return func(pending)
        return inner
    
    
    def get_drug_urls():
        """
        Read the drug overview URLs to crawl from file_path, one per line.

        Surrounding whitespace is stripped and blank lines are skipped, because
        an empty URL cannot be split into a drug id by Drug.
        """
        with open(file_path,'r',encoding='utf-8') as f:
            stripped = (line.strip() for line in f)
            return [url for url in stripped if url]
    
    
    def get_diabetes_urls():
        """Return the de-duplicated overview URLs built from the ids in Diabetes_ids."""
        unique_urls = {'http://ypk.39.net/{}/'.format(drug_id) for drug_id in Diabetes_ids}
        return list(unique_urls)
    
    
    def main(drug_base_url):
        """Create a Drug instance for drug_base_url; constructing it performs the crawl of that drug."""
        Drug(drug_base_url)
    
    
    def validateTitle(title):
        """
        Return title with the characters / \\ : * ? " < > | each replaced by "_".

        The result is used as part of folder and file names.
        """
        rstr = r'[/\\:*?"<>|]'
        new_title = re.sub(rstr, "_", title)
        return new_title
    
    
    def spider(url):
        """
        Download a single drug page into "<name>[<id>]/<name>[<id>].html".

        The id is the URL path without slashes and the name is the page's
        <h1> text.  Any failure while fetching or parsing is printed, the url
        is written to the error log and nothing is saved.
        """
        drug_id = urlsplit(url).path.strip('/')
        try:
            reply = requests.get(url=url,headers=headers,timeout=3)
            response = reply.content.decode('gb2312','ignore')
            title = etree.HTML(response).xpath('//div[@class="t1"]/h1/text()')[0]
            drug_name = validateTitle(title)
        except Exception as e:
            print(e)
            logger.log(url,False)
            return
        folder_name = '{}[{}]'.format(drug_name, drug_id)
        drug_dir_path = os.path.join(DOWNLOAD_DIR, folder_name)
        if not os.path.exists(drug_dir_path):
            os.mkdir(drug_dir_path)
        drug_html_detail = os.path.join(drug_dir_path, folder_name + '.html')
        if os.path.exists(drug_html_detail):
            # Already saved by an earlier run.
            return
        with open(drug_html_detail,'w',encoding='gb2312') as file:
            file.write(response)
        print(drug_name,'下载成功')
    
    
    @check_downloaded
    def run(drug_urls):
        """Print the pending URLs and their count, then crawl them with a pool of 5 processes."""
        print(drug_urls)
        print(len(drug_urls))
        workers = Pool(processes=5)
        # One main() call, and therefore one Drug, per URL.
        workers.map(main, drug_urls)
        workers.close()
        workers.join()
    
    
    if __name__ == '__main__':
        # Crawl every URL listed in the URL file; run() is wrapped by
        # check_downloaded, which filters the list before the pool starts.
        drug_urls = get_drug_urls()
        run(drug_urls)
        # Alternative entry point: crawl the URLs built from Diabetes_ids.
        # urls = get_diabetes_urls()
        # run(urls)

    具体实现:进程池,requests+lxml,打印日志,类,重新下载排除已下载页面

        2.解析所有药品页面提取有价值信息

    # -*- coding: utf-8 -*-
    
    """
    @Datetime: 2018/10/13
    @Author: Zhang Yafei
    """
    import csv
    import json
    import os
    import re
    from scrapy.selector import Selector
    import logging
    import pandas
    # import numpy as np
    
    # Directory containing this script; the paths below are built from it.
    BASE_DIRS = os.path.dirname(os.path.abspath(__file__))
    # Folder whose entries are treated as drug folders (see DrugInfo).
    drug_path = os.path.join(BASE_DIRS,'药品')
    # Entries of drug_path, listed once at import time (raises if the folder is missing).
    dirs_list = os.listdir(drug_path)

    # NOTE(review): not referenced in the visible part of this file; the
    # readers and writers below use the relative name 'drug_info.tsv' instead.
    analysis_file_path = os.path.join(BASE_DIRS,'drug_info.tsv')

    # Log files used by the Logger class below.
    RUN_LOG_FILE = os.path.join(BASE_DIRS,'analysis_log','run.log')
    ERROR_LOG_FILE = os.path.join(BASE_DIRS,'analysis_log','error_log')
    
    
    def get_unresoved_drug_list():
        """
        Return the drug folders that have no row in drug_info.tsv yet.

        A row is matched to a folder by rebuilding the folder name
        "<药品名称>[<药品ID>]" from its two columns.
        """
        data = pandas.read_csv('drug_info.tsv',sep='\t',encoding='utf-8')
        # Pairing the two columns directly gives the same names as a row-wise
        # apply, and also copes with a table that has a header but no rows.
        already_parsed = {
            '{}[{}]'.format(name, drug_id)
            for name, drug_id in zip(data['药品名称'], data['药品ID'])
        }
        return list(set(dirs_list) - already_parsed)
    
    
    def write_resoved_drag_list(drag):
        """Add drag to the JSON list of parsed drugs stored in resolved_drag_list.py."""
        record_file = 'resolved_drag_list.py'
        resoved_drag_list = set()
        if os.path.exists(record_file):
            # Start from what earlier calls recorded.
            with open(record_file, 'r', encoding='utf-8') as f:
                resoved_drag_list.update(json.load(f))
        resoved_drag_list.add(drag)
        with open(record_file,'w',encoding='utf-8') as f:
            json.dump(list(resoved_drag_list),f)
    
    
    def write_error_drag_list(drag):
        """Add drag to the JSON list of failed drugs stored in error_drag_list.py."""
        record_file = 'error_drag_list.py'
        previous = []
        if os.path.exists(record_file):
            with open(record_file, 'r', encoding='utf-8') as f:
                previous = json.load(f)
        # A set keeps each drug only once however often it fails.
        error_drag_list = set(previous)
        error_drag_list.add(drag)
        with open(record_file,'w',encoding='utf-8') as f:
            json.dump(list(error_drag_list),f)
    
    
    class Logger(object):
        """
        Pair of loggers: one for run (INFO) messages, one for error messages.

        Each logger appends to its own file (RUN_LOG_FILE / ERROR_LOG_FILE)
        and echoes through a logging.StreamHandler.  Only the stream handler
        gets the timestamped format; the file handler writes the bare message.
        """
        def __init__(self):
            self.run_log_file = RUN_LOG_FILE
            self.error_log_file = ERROR_LOG_FILE
            # Kept for backward compatibility; nothing in this class reads them.
            self.run_log = None
            self.error_log = None

            self.initialize_run_log()
            self.initialize_error_log()

        @staticmethod
        def check_path_exist(log_abs_file):
            """Make sure the directory that will hold log_abs_file exists."""
            log_path = os.path.split(log_abs_file)[0]
            # A bare file name has no directory part; nothing to create then.
            if log_path:
                # makedirs handles nested paths and an already existing directory.
                os.makedirs(log_path, exist_ok=True)

        def _build_logger(self, name, level, log_file, stream_format):
            """Create a logger writing to log_file and, formatted, to a stream."""
            self.check_path_exist(log_file)
            fh = logging.FileHandler(log_file, 'a', encoding='utf-8')
            sh = logging.StreamHandler()
            sh.setFormatter(logging.Formatter(fmt=stream_format))
            built = logging.Logger(name, level=level)
            built.addHandler(fh)
            built.addHandler(sh)
            return built

        def initialize_run_log(self):
            """Create the INFO-level run logger and store it as self.run_logger."""
            self.run_logger = self._build_logger(
                'run_log', logging.INFO, self.run_log_file,
                "%(asctime)s - %(levelname)s :  %(message)s")

        def initialize_error_log(self):
            """Create the ERROR-level logger and store it as self.error_logger."""
            self.error_logger = self._build_logger(
                'error_log', logging.ERROR, self.error_log_file,
                "%(asctime)s  - %(levelname)s :  %(message)s")

        def log(self, message, mode=True):
            """
            Write one log record.

            :param message: text to log
            :param mode: True logs to the run log, False logs to the error log
            :return:
            """
            if mode:
                self.run_logger.info(message)
            else:
                self.error_logger.error(message)
    
    
    class DrugInfo(object):
        """
        提取的药品信息:
            self.drug_name                      #药品名称
            self.category                       #药品类型
            self.cite                           #国家标准
            self.company                        #生产厂家
            self.address                        #厂家地址
            self.license_number                 #批准文号
            self.approval_date                  #批准日期
            self.form_drug                      #剂型
            self.spec                           #规格
            self.store                          #储存方法
            self.period_valid                   #有效期限
            self.attention_rank                 #关注度排名
            self.indication                     #适应症
            self.component                      #成分
            self.function                       #功能主治
            self.usage_dosage                   #用法用量
            self.contraindication               #禁忌症
            self.special_population             #特殊人群用药
            self.indications                    #适应症概况
            self.is_or_not_medical_insurance    #是否属于医保
            self.is_or_not_infections           #是否有传染性
            self.related_symptoms               #相关症状
            self.related_examination            #相关检查
            self.adverse_reaction               #不良反应
            self.attention_matters              #注意事项
            self.interaction                    #药物相互作用
            self.pharmacological_action         #药理作用
            self.revision_date                  #说明书修订日期
            self.drug_use_consult               #用药咨询
            self.drug_use_experience            #用药经验
    
        """
        def __init__(self,drug):
            drug_dir = os.path.join(drug_path, drug)
            self.drug_name = re.findall('(.*?)[d+]',drug)[0]
            self.drug_id = re.findall('.*?[(d+)].*',drug)[0]
            self.drug_dir = drug_dir
            self.drug_use_experience = ''
            self.drug_use_consult = ''
            self.file_list = os.listdir(self.drug_dir)
    
            self.logger = Logger()
    
            self.result = True
    
            self.dispatch()
            if self.drug_use_consult.__len__()==0:self.drug_use_consult = '无'
            if self.drug_use_experience.__len__()==0:self.drug_use_experience = '无'
    
        def dispatch(self):
            for file in self.file_list:
                if file.endswith('药品概述.html'):
                    self.drug_summary(self.file_path(file))
                elif file.endswith('详细说明书.html'):
                    self.drug_instruction(self.file_path(file))
                elif re.match('.*?用药咨询.*',file):
                    self.drug_consultation(self.file_path(file))
                elif re.match('.*?用药经验.*',file):
                    self.drug_experience(self.file_path(file))
                else:
                    self.result = False
                    break
    
        def file_path(self,file):
            return os.path.join(self.drug_dir,file)
    
        def read_file(self,file):
            with open(file,'r') as f:
                html = f.read()
            return html
    
        def drug_summary(self,file):
            """药品概况"""
            html = self.read_file(file)
            selector = Selector(text=html)
            self.category = selector.xpath('//div[@class="t1"]/cite[1]/span/text()').extract_first()    #药品类型
            if not self.category:
                self.category = '未知'
            self.cite = selector.xpath('//div[@class="t1"]/cite[2]/span/text()').extract_first()    #国家标准
            if not self.cite:
                self.cite = '未知'
            try:
                self.company = selector.css('.t3 .company a::text').extract()[0]    #生产厂家
            except IndexError as e:
                self.company = '未知'
            try:
                self.address = selector.css('.t3 .address::text').extract()[0]  #厂家地址
            except IndexError as e:
                self.address = '未知'
            try:
                self.license_number = selector.xpath('//ul[@class="xxs"]/li[1]/text()').extract_first().strip() #批准文号
            except AttributeError:
                self.license_number = '未知'
            try:
                self.approval_date = selector.xpath('//ul[@class="xxs"]/li[2]/text()').extract_first().strip()  #批准日期
            except AttributeError:
                self.approval_date = '未知'
            try:
                self.form_drug = selector.xpath('//ul[@class="showlis"]/li[1]/text()').extract_first().strip()  #剂型
            except AttributeError:
                self.form_drug = '未知'
            try:
                self.spec = selector.xpath('//ul[@class="showlis"]/li[2]/text()').extract_first().strip()       #规格
            except AttributeError:
                self.spec = '未知'
            try:
                self.store = selector.xpath('//ul[@class="showlis"]/li[3]/text()').extract_first().strip().strip('。')     #储存方法
            except AttributeError:
                self.store = '未知'
            try:
                self.period_valid = selector.xpath('//ul[@class="showlis"]/li[4]/text()').extract_first().strip('。').replace('
    ','')   #有效期限
            except AttributeError:
                self.period_valid = '未知'
            self.attention_rank = selector.css('.guanzhu cite font::text').extract_first()  #关注度排名
            if not self.attention_rank:
                self.attention_rank = '未知'
            self.indication = ','.join(selector.css('.whatsthis li::text').extract())   #适应症
            if self.indication == '':
                self.indication = '未知'
            usage_dosage = selector.css('.ps p:nth-child(3)::text').extract_first()   #用法用量
            if usage_dosage:
                self.usage_dosage = re.sub('<.*?>','',usage_dosage).strip().replace('
    ','')  #禁忌症
            else:
                self.usage_dosage = '未知'
            indications = selector.css('#diseaseintro::text').extract_first()  #适应症概况
            if indications:
                self.indications = re.sub('<.*?>','',indications).strip().replace('
    ','')  #禁忌症
            else:
                self.indications = '未知'
            try:
                self.is_or_not_medical_insurance = selector.css('.syz_cons p:nth-child(2)::text').extract_first().split(':')[1] #是否属于医保
            except AttributeError as e:
                self.is_or_not_medical_insurance = '未知'
            try:
                self.is_or_not_infections = selector.css('.syz_cons p:nth-child(3)::text').extract_first().split(':')[1].strip()  #是否有传染性
            except AttributeError as e:
                self.is_or_not_infections = '未知'
            self.related_symptoms = ','.join(selector.css('.syz_cons p:nth-child(4) a::text').extract()[:-1])      #相关症状
            if len(self.related_symptoms) == 0:
                self.related_symptoms = '未知'
            self.related_examination = ','.join(selector.css('.syz_cons p:nth-child(5) a::text').extract()[:-1])    #相关检查
            if len(self.related_examination) == 0:
                self.related_examination = '未知'
    
        def drug_instruction(self,file):
            """详细说明书"""
            html = self.read_file(file)
            selector = Selector(text=html)
            #注:不同药品之间网页结构有差别,提取的时候应注意
            component = selector.xpath('//dt[text()="【成份】"]/following::*[1]').extract_first()
            if not component:
                self.component = '未知'
            else:
                self.component = re.sub('<.*?>','',component).strip()       #成分
            contraindication= selector.xpath('//dt[text()="【禁忌】"]/following::*[1]').extract_first()
            if contraindication:
                self.contraindication = re.sub('<.*?>','',contraindication).strip().replace('
    ','')  #禁忌症
            else:
                self.contraindication = '未知'
            function = selector.xpath('//dt[text()="【功能主治】"]/following::*[1]').extract_first()
            if function:
                self.function = re.sub('<.*?>','',function).strip()         #功能主治
            else:
                self.function = '未知'
    
            try:
                self.adverse_reaction = selector.xpath('//dt[text()="【不良反应】"]/following::*[1]/p/text()').extract_first().strip('。')  #不良反应
            except AttributeError as e:
                try:
                    self.adverse_reaction = selector.xpath('//dt[text()="【不良反应】"]/following::*[1]/text()').extract_first().strip('。')  #不良反应
                    self.adverse_reaction = re.sub('<.*?>','',self.adverse_reaction).strip().replace('
    ','')  #注意事项
                except AttributeError:
                    self.adverse_reaction = '未知'
            attention_matters = selector.xpath('//dt[text()="【注意事项】"]/following::*[1]').extract_first()
            if attention_matters:
                self.attention_matters = re.sub('<.*?>','',attention_matters).strip().replace('
    ','')  #注意事项
            else:
                self.attention_matters = '未知'
                self.logger.log('{}[{}]-注意事项为空'.format(self.drug_name,self.drug_id),False)
            try:
                self.interaction = selector.xpath('//dt[text()="【药物相互作用】"]/following::*[1]/p/text()').extract_first()  #药物相互作用
                self.interaction = re.sub('<.*?>','',self.interaction).strip().replace('
    ','')  #注意事项
            except TypeError:
                self.interaction = '未知'
            try:
                self.pharmacological_action = selector.xpath('//dt[text()="【药理作用】"]/following::*[1]/p/text()').extract_first()  #药理作用
                self.pharmacological_action = re.sub('<.*?>','',self.pharmacological_action).strip().replace('
    ','')
            except TypeError:
                self.pharmacological_action = '未知'
            try:
                self.revision_date = selector.xpath('//dt[text()="【说明书修订日期】"]/following::*[1]/text()').extract_first().strip()  #说明书修订日期
            except AttributeError:
                self.revision_date = '未知'
            try:
                self.special_population = selector.xpath('//dt[text()="【特殊人群用药】"]/following::*[1]/text()').extract_first()  #特殊人群用药
                self.special_population = re.sub('<.*?>','',self.special_population).strip().replace('
    ','')  #特殊人群用药
            except TypeError:
                self.special_population = '未知'
    
        def drug_consultation(self,file):
            """用药咨询"""
            html = self.read_file(file)
            selector = Selector(text=html)
            drug_use_consult = selector.css('.dpzx_con .zx p::text').extract()
            drug_use_consult = ''.join(drug_use_consult)
            drug_use_consult = re.sub('<.*?>','',drug_use_consult).strip().replace('
    ','')  #用药咨询
            self.drug_use_consult += drug_use_consult
    
        def drug_experience(self,file):
            """用药经验"""
            html = self.read_file(file)
            selector = Selector(text=html)
            drug_use_experience = selector.css('.pls_box .pls_mid p::text').extract()
            drug_use_experience = ''.join(drug_use_experience)
            drug_use_experience = re.sub('<.*?>','',drug_use_experience).strip().replace('
    ','')  #用药经验
            self.drug_use_experience += drug_use_experience.strip()
    
        @staticmethod
        def write_to_fileheader():
            """Create (or truncate) drug_info.tsv and write the column header row."""
            with open('drug_info.tsv', mode='w', newline='', encoding='utf-8') as tsv_file:
                header_row = [
                    '药品名称','药品ID','药品类型','国家标准','生产厂家','厂家地址','批准文号','批准日期',
                    '剂型','规格','储存方法','有效期限','关注度排名','适应症','成分','功能主治','用发用量',
                    '禁忌症','特殊人群用药','适应症概况','是否用于医保','是否具有传染性','相关症状','相关检查',
                    '不良反应','注意事项','药物相互作用','药理作用','说明书修订日期','用药经验','用药咨询',
                ]
                csv.writer(tsv_file, dialect='excel-tab').writerow(header_row)
    
        def write_to_file(self):
            """Append this drug's fields as one row of drug_info.tsv, then log completion."""
            with open('drug_info.tsv', mode='a', newline='', encoding='utf-8') as tsv_file:
                # Field order must match the header written by write_to_fileheader.
                row = [
                    self.drug_name,
                    self.drug_id,
                    self.category,
                    self.cite,
                    self.company,
                    self.address,
                    self.license_number,
                    self.approval_date,
                    self.form_drug,
                    self.spec,
                    self.store,
                    self.period_valid,
                    self.attention_rank,
                    self.indication,
                    self.component,
                    self.function,
                    self.usage_dosage,
                    self.contraindication,
                    self.special_population,
                    self.indications,
                    self.is_or_not_medical_insurance,
                    self.is_or_not_infections,
                    self.related_symptoms,
                    self.related_examination,
                    self.adverse_reaction,
                    self.attention_matters,
                    self.interaction,
                    self.pharmacological_action,
                    self.revision_date,
                    self.drug_use_experience,
                    self.drug_use_consult,
                ]
                csv.writer(tsv_file, dialect='excel-tab').writerow(row)
            self.logger.log('{}[{}]信息写入文件完毕'.format(self.drug_name,self.drug_id))
    
    
    def main(drug):
        """Parse one drug directory and record it as resolved or failed.

        :param drug: directory name of the drug, passed on to ``DrugInfo``
        """
        info = DrugInfo(drug)
        if not info.result:
            # Parsing failed: log it as an error and remember the drug for a retry.
            info.logger.log('{}[{}]'.format(info.drug_name,info.drug_id),False)
            write_error_drag_list(drug)
            return
        info.write_to_file()
        write_resoved_drag_list(drug)
    
    
    def new_data(row):
        """Split a "name(alias)" drug name into separate name and alias fields.

        :param row: mapping-like row (e.g. a pandas Series) with a '药品名称' entry
        :return: the same row. '别名' receives the text after the last '(' with
                 any trailing ')' removed and '药品名称' the text before it.
                 When the name has no '(' or is not a string, '别名' is NaN and
                 '药品名称' is left untouched.
        """
        drug_name = row['药品名称']
        # Missing names arrive as float NaN from pandas; treat them like "no alias".
        parts = drug_name.rsplit('(', 1) if isinstance(drug_name, str) else []
        if len(parts) == 2:
            row['别名'] = parts[1].strip(')')
            row['药品名称'] = parts[0]
        else:
            # float('nan') is what pandas treats as missing; it avoids numpy.NAN,
            # an alias that NumPy 2.0 removed.
            row['别名'] = float('nan')
        return row
    
    
    def update_drug_name():
        """Add a '别名' (alias) column to drug_info.tsv and save the result as new_drug_info.tsv."""
        # '\t' is spelled as an escape: a literal TAB inside the quotes is invisible
        # and is easily turned into spaces by editors or copy-paste.
        data = pandas.read_csv('drug_info.tsv', sep='\t', encoding='utf-8')
        columns = data.columns.tolist()
        # Put the new column directly after '药品名称'; reindex creates it filled with NaN.
        columns.insert(columns.index('药品名称') + 1, '别名')
        data = data.reindex(columns=columns)
        new_drug = data.apply(new_data, axis=1)
        new_drug.to_csv('new_drug_info.tsv', index=False, sep='\t', encoding='utf-8')
        print('文件保存成功')
        # print(new_drug[['药品名称','别名']])
    
        # for row in data[10:13].iterrows():
            # drug_name = row['药品名称'].values
            # drug_alias = drug_name.rsplit('(',1)[1].strip(')')
            # print(drug_name)
            # print(drug_alias)
    # print(data.tail(10).index)
    # print(data.iloc[:2,1:8])
    # print(data.iloc[1]['注意事项'].replace('\n',''))
    # print(data.iloc[2]['注意事项'].replace('\n',''))
    # print(data.__len__())
    # resoved_drus_list = data.apply(lambda row:'{}[{}]'.format(row['药品名称'],row['药品ID']),axis=1).tolist()
    # print(resoved_drus_list.__len__())
    # unresoved_drug_list = set(dirs_list) - set(resoved_drus_list)
    # print(unresoved_drug_list.__len__())
    
    
    if __name__ == '__main__':
        # Start a fresh output file (header row only) when none exists yet.
        output_exists = os.path.exists(analysis_file_path)
        if not output_exists:
            DrugInfo.write_to_fileheader()
        drug_list = get_unresoved_drug_list()
        print(len(drug_list))
        for pending_drug in drug_list:
            main(pending_drug)
        # Optional follow-up step: update_drug_name() adds the alias column.
    

     3.下载糖尿病相关药品页面

    # -*- coding: utf-8 -*-
    
    """
    @Datetime: 2018/11/10
    @Author: Zhang Yafei
    """
    import json
    
    import requests
    from scrapy.selector import Selector
    from lxml import etree
    from multiprocessing import Pool
    
    ids_list = []

    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36'}


    def spider(url):
        """Fetch one listing page and collect the drug IDs linked from it.

        :param url: URL of a listing page
        :return: list of hrefs with surrounding '/' removed; the same values are
                 also appended to the module-level ``ids_list`` of the calling process
        """
        response = requests.get(url,headers=headers)
        html = etree.HTML(response.text)
        hrefs = html.xpath('//ul[@class="search_ul search_ul_yb"]/li/a/@href')
        ids = [href.strip('/') for href in hrefs]
        ids_list.extend(ids)
        return ids


    if __name__ == '__main__':
        urls = ['http://ypk.39.net/tangniaobing/p{}'.format(i) for i in range(1,135)]
        # Each worker process extends its own copy of ids_list, so the IDs must be
        # taken from the values pool.map returns. Previously those were discarded
        # and every page was downloaded a second time sequentially.
        with Pool(4) as pool:
            pages = pool.map(spider,urls)
        ids_list = [drug_id for page_ids in pages for drug_id in page_ids]
        with open('ids.py','w',encoding='utf-8') as f:
            json.dump(ids_list,f)
    

      4.更新药品信息,拆分药名列分为药品名称列和别名列

    def new_data(row):
        """Split a "name(alias)" drug name into separate name and alias fields.

        :param row: mapping-like row (e.g. a pandas Series) with a '药品名称' entry
        :return: the same row. '别名' receives the text after the last '(' with
                 any trailing ')' removed and '药品名称' the text before it.
                 When the name has no '(' or is not a string, '别名' is NaN and
                 '药品名称' is left untouched.
        """
        drug_name = row['药品名称']
        # Missing names arrive as float NaN from pandas; treat them like "no alias".
        parts = drug_name.rsplit('(', 1) if isinstance(drug_name, str) else []
        if len(parts) == 2:
            row['别名'] = parts[1].strip(')')
            row['药品名称'] = parts[0]
        else:
            # float('nan') is what pandas treats as missing; it needs no numpy
            # import and avoids numpy.NAN, an alias that NumPy 2.0 removed.
            row['别名'] = float('nan')
        return row
    
    def update_drug_name():
        """Add a '别名' (alias) column to drug_info.tsv and save the result as new_drug_info.tsv."""
        # '\t' is spelled as an escape: a literal TAB inside the quotes is invisible
        # and is easily turned into spaces by editors or copy-paste.
        data = pandas.read_csv('drug_info.tsv', sep='\t', encoding='utf-8')
        columns = data.columns.tolist()
        # Put the new column directly after '药品名称'; reindex creates it filled with NaN.
        columns.insert(columns.index('药品名称') + 1, '别名')
        data = data.reindex(columns=columns)
        new_drug = data.apply(new_data, axis=1)
        new_drug.to_csv('new_drug_info.tsv', index=False, sep='\t', encoding='utf-8')
        print('文件保存成功')
    
    
    if __name__ == '__main__':
        # Script entry point: rewrite drug_info.tsv into new_drug_info.tsv with the alias column.
        update_drug_name()
    

      5.抓取所有药品评论数,并构建药品评论数字典

    # -*- coding: utf-8 -*-
    
    """
    @Datetime: 2018/11/10
    @Author: Zhang Yafei
    """
    import pandas
    import os
    import re
    # import jieba
    from multiprocessing.pool import Pool
    from scrapy import Selector
    import json
    import numpy
    import time
    import csv
    
    # Directory containing this script; the drug pages are expected in its '药品' sub-directory.
    BASE_DIR = os.path.dirname(os.path.abspath(__file__))
    drug_path = os.path.join(BASE_DIR, '药品')
    # One entry per drug directory; listed once at import time.
    dirs_list = os.listdir(drug_path)
    # drug directory name -> review count, filled by DrugInfo.drug_experience.
    result = {}
    # Parallel lists of names and counts, filled by read_num().
    k_list = []
    v_list = []
    # Only referenced from commented-out code in this script.
    class_list = []
    
    # NOTE(review): DrugInfo.drug_summary reads `comment_data`, which is only
    # defined by the commented-out lines below.
    # comment_data = pandas.read_csv('comment_num_grade.csv',encoding='utf-8')
    # data = list(comment_data.药品名称.values)
    
    # comment_data['类别'] = ''
    count = 0
    
    
    class DrugInfo(object):
        """Collect the review count of one drug from its saved pages.

        Instantiating the class parses the drug's directory right away; the count
        found on each "用药经验" page is stored in the module-level ``result`` dict
        under the directory name.
        """

        # Directory names look like "<drug name>[<numeric id>]". The backslashes of
        # this pattern had been lost, which turned it into a character class that
        # never matched such names.
        _DIRNAME_PATTERN = re.compile(r'(.*?)\[(\d+)\]')

        def __init__(self, drug):
            """
            :param drug: directory name of the drug inside ``drug_path``
            :raises ValueError: if ``drug`` contains no "[<digits>]" part
            """
            self.drug = drug
            self.drug_name, self.drug_id = self._split_drug_dirname(drug)
            self.drug_dir = os.path.join(drug_path, drug)
            self.file_list = os.listdir(self.drug_dir)

            self.dispatch()

        @staticmethod
        def _split_drug_dirname(drug):
            """Return ``(name, id)`` parsed from a "<name>[<id>]" directory name."""
            matched = DrugInfo._DIRNAME_PATTERN.match(drug)
            if matched is None:
                raise ValueError('no "[<id>]" part in drug directory name: {!r}'.format(drug))
            return matched.group(1), matched.group(2)

        def dispatch(self):
            """Route every file of the drug directory to the parser for its page type."""
            for file in self.file_list:
                # Other page types are currently disabled:
                # if file.endswith('药品概述.html'):
                #     self.drug_summary(self.file_path(file))
                # if re.match('.*?用药咨询.*',file):
                #     self.drug_consultation(self.file_path(file))
                if re.match('.*?用药经验.*', file):
                    self.drug_experience(self.file_path(file))

        def file_path(self, file):
            """Return the full path of ``file`` inside this drug's directory."""
            return os.path.join(self.drug_dir, file)

        def read_file(self, file):
            """Return the text content of ``file`` (opened with the platform default encoding)."""
            with open(file, 'r') as f:
                html = f.read()
            return html

        def drug_summary(self, file):
            """Read the drug's category from its overview page into ``comment_data``.

            NOTE(review): ``comment_data`` is only defined in commented-out code at
            module level, so calling this method as-is raises NameError.
            """
            html = self.read_file(file)
            selector = Selector(text=html)
            category = selector.xpath('//div[@class="subs"]/p/a[last()]/text()').extract_first()
            print(category)
            index = comment_data.loc[comment_data.药品名称 == self.drug, '类别'].index.values[0]
            comment_data.loc[index, '类别'] = category

        def drug_experience(self, file):
            """Read the review count shown on a "用药经验" page and store it in ``result``.

            A page without the counter element counts as 0 reviews.
            """
            html = self.read_file(file)
            selector = Selector(text=html)
            drug_use_experience_num = selector.css('.dps cite font::text').extract_first()
            if not drug_use_experience_num:
                self.drug_use_experience_num = 0
            else:
                self.drug_use_experience_num = int(drug_use_experience_num)
            result[self.drug] = self.drug_use_experience_num
            print(self.drug,self.drug_use_experience_num)
    
    
    def write_to_file(self):
        """Append one "drug, review count" row to comment_num_grade.csv.

        :param self: object exposing ``drug`` and ``drug_use_experience_num``;
                     when an attribute is missing the function returns silently
        """
        try:
            with open('comment_num_grade.csv', mode='a', newline='', encoding='utf_8_sig') as csv_file:
                csv.writer(csv_file).writerow([self.drug, self.drug_use_experience_num])
            print('{}写入文件完毕'.format(self.drug))
        except AttributeError:
            return None
    
    
    def write_num():
        """Save the collected review counts as JSON (comment.py) and as a ranked CSV.

        :return: DataFrame with columns '药品名称' and '评论数', sorted by count descending
        """
        with open('comment.py', mode='w', encoding='utf-8') as json_file:
            json.dump(result, json_file)
        table = pandas.DataFrame({'药品名称': list(result.keys()), '评论数': list(result.values())})
        ranked = table.sort_values(by='评论数', ascending=False)
        ranked.to_csv('comment_num_grade.csv', sep=',', encoding='utf_8_sig', mode='w', index=False)
        return ranked
    
    
    def read_num():
        """Reload the counts saved in comment.py and rewrite the ranked CSV from them.

        The loaded names and counts are appended to the module-level ``k_list``
        and ``v_list``.

        :return: DataFrame with columns '药品名称' and '评论数', sorted by count descending
        """
        with open('comment.py', mode='r', encoding='utf-8') as json_file:
            counts = json.load(json_file)
        k_list.extend(counts.keys())
        v_list.extend(counts.values())
        table = pandas.DataFrame({'药品名称': k_list, '评论数': v_list})
        ranked = table.sort_values(by='评论数', ascending=False)
        ranked.to_csv('comment_num_grade.csv', sep=',', encoding='utf_8_sig', mode='w', index=False)
        return ranked
    
    
    def main(drug):
        """Process one drug directory; constructing DrugInfo performs the parsing.

        :param drug: directory name of the drug
        """
        DrugInfo(drug)
        # Earlier variant, kept for reference:
        # try:
        #     result[d.drug] = d.drug_use_experience_num
        # except:
        #     result[d.drug] = 0
        # write_to_file(d)
    
    
    if __name__ == '__main__':
        started_at = time.time()
        # Parse every drug directory in turn, then persist the collected counts.
        for drug_dir_name in dirs_list:
            main(drug_dir_name)
        write_num()
        print('总花费:{}秒'.format(time.time() - started_at))
    

      6.提取评论数量最多的前10个药品评论信息

    # -*- coding: utf-8 -*-
    
    """
    @Datetime: 2018/11/10
    @Author: Zhang Yafei
    """
    import csv
    
    import numpy
    import pandas
    import os
    import re
    import jieba
    from scrapy import Selector
    import re
    
    # Directory containing this script; the drug pages are expected in its '药品' sub-directory.
    BASE_DIR = os.path.dirname(os.path.abspath(__file__))
    drug_path = os.path.join(BASE_DIR,'药品')
    # One entry per drug directory; listed once at import time.
    dirs_list = os.listdir(drug_path)
    
    # CSV file that receives one row per extracted review.
    comment_info_filename = 'first50_comment.csv'
    
    class DrugInfo(object):
        """
        Extract the user reviews from one drug's saved "用药经验" pages and append
        them, one CSV row per review, to ``comment_info_filename``.
        """

        # Directory names look like "<drug name>[<numeric id>]". The backslashes of
        # this pattern had been lost, which turned it into a character class that
        # never matched such names.
        _DIRNAME_PATTERN = re.compile(r'(.*?)\[(\d+)\]')

        def __init__(self,drug):
            """
            :param drug: directory name of the drug inside ``drug_path``
            :raises ValueError: if ``drug`` contains no "[<digits>]" part
            """
            self.drug_name, self.drug_id = self._split_drug_dirname(drug)
            self.drug_dir = os.path.join(drug_path, drug)
            self.drug_use_experience = ''
            self.file_list = os.listdir(self.drug_dir)
            self.result = True

            self.dispatch()

        @staticmethod
        def _split_drug_dirname(drug):
            """Return ``(name, id)`` parsed from a "<name>[<id>]" directory name."""
            matched = DrugInfo._DIRNAME_PATTERN.match(drug)
            if matched is None:
                raise ValueError('no "[<id>]" part in drug directory name: {!r}'.format(drug))
            return matched.group(1), matched.group(2)

        def dispatch(self):
            """Send every "用药经验" page of the drug directory to ``drug_experience``."""
            for file in self.file_list:
                # if re.match('.*?用药咨询.*',file):
                #     self.drug_consultation(self.file_path(file))
                if re.match('.*?用药经验.*',file):
                    self.drug_experience(self.file_path(file))

        def file_path(self,file):
            """Return the full path of ``file`` inside this drug's directory."""
            return os.path.join(self.drug_dir,file)

        def read_file(self,file):
            """Return the text content of ``file`` (opened with the platform default encoding)."""
            with open(file,'r') as f:
                html = f.read()
            return html

        def drug_experience(self,file):
            """Write one CSV row per review found on a saved "用药经验" page.

            A page without any review block produces a single row whose reviewer,
            segmentation and time fields are NaN.
            """
            print(file)
            html = self.read_file(file)
            selector = Selector(text=html)
            review_blocks = selector.css('.pls_box')
            try:
                page = selector.css('.dpzx .pages .pgleft span::text').extract()[0]
            except IndexError:
                page = 1
            drug_url = 'http://ypk.39.net/{}/comment/k0_p{}'.format(self.drug_id,page)
            if not review_blocks:
                # Do not carry the last review of a previously parsed page into the placeholder row.
                self.drug_use_experience = ''
                self.write_to_file(numpy.nan,numpy.nan,numpy.nan,drug_url)
                return
            for block in review_blocks:
                # The line-break literals below had been split across physical
                # lines (syntax errors). CR and LF are both handled.
                self.drug_use_experience = block.css('.pls_mid p::text').extract()[0].replace(' ','').strip('\r\n')
                commter_info = block.css('.pls_top cite::text').extract()[0].replace('\r','').replace('\n','')
                # NOTE(review): str.strip('来自') removes those characters from both
                # ends, not the prefix as a unit — confirm this is intended.
                commter_info = commter_info.strip('来自').strip(' ').replace('  ','/').rstrip('/')
                cut_info = '/'.join(jieba.cut(self.drug_use_experience))
                cut_info = cut_info.strip('/\t')
                comment_time = block.css('.pls_top i::text').extract()[0].strip().strip('点评时间:')
                if not comment_time:
                    comment_time = numpy.nan
                self.write_to_file(commter_info,cut_info,comment_time,drug_url)

        def write_to_file(self,commter_info,cut_info,time,drug_url):
            """Append one review row (the review text is taken from ``self.drug_use_experience``)."""
            with open(comment_info_filename, 'a', newline='', encoding='utf_8_sig') as MyFile:
                writer = csv.writer(MyFile)
                druginfo_list = [self.drug_name,self.drug_id,commter_info,self.drug_use_experience,cut_info,time,drug_url]
                writer.writerow(druginfo_list)
            print('{}写入文件完毕'.format(drug_url))
    
    
    def write_to_fileheader():
        """Create (or truncate) the review CSV and write its header row."""
        column_names = ['药品名称','药品ID','评论者信息','评论','分词','评论时间','url']
        with open(comment_info_filename, mode='w', newline='', encoding='utf_8_sig') as csv_file:
            csv.writer(csv_file).writerow(column_names)
    
    
    def main(drug):
        """Process one drug directory (constructing DrugInfo does the work) and report completion.

        :param drug: directory name of the drug
        """
        DrugInfo(drug)
        print('解析完成')
    
    
    def read_comment_num_first50():
        """
        Map the drugs listed in concat_first50_comment.csv to local drug directories.

        The drug ID is taken as the last run of digits in each name. IDs are
        compared as whole numbers, so ID 123 no longer matches a directory whose
        ID is 1234 (the previous substring search did).

        :return: list of directory names from ``dirs_list``, one per listed drug
                 whose ID exists locally; drugs without a local directory are skipped
        """
        data = pandas.read_csv('concat_first50_comment.csv',encoding='utf-8')

        # The backslash of this pattern had been lost ('d+' matched the letter "d").
        digits = re.compile(r'\d+')

        # ID -> first directory in dirs_list carrying that ID.
        dir_by_id = {}
        for dirname in dirs_list:
            numbers = digits.findall(dirname)
            if numbers:
                dir_by_id.setdefault(numbers[-1], dirname)

        drugs = []
        for name in data.药品名称.values.tolist():
            numbers = digits.findall(str(name))
            if numbers and numbers[-1] in dir_by_id:
                drugs.append(dir_by_id[numbers[-1]])
        return drugs
    
    if __name__ == '__main__':
        # Write the header only when the output file does not exist yet.
        output_path = os.path.join(BASE_DIR,comment_info_filename)
        if not os.path.exists(output_path):
            write_to_fileheader()
        drugs = read_comment_num_first50()
        print(len(drugs))
        for drug_dir_name in drugs:
            main(drug_dir_name)
        print(len(drugs))
    

      7.分析药品评论数量所占比例

    # -*- coding: utf-8 -*-
    """
    Created on Mon Nov 12 19:28:09 2018
    
    @author: Zhang Yafei
    """
    import json
    import os
    
    import pandas
    #from wordcloud import WordCloud as wc
    #from pyecharts import WordCloud
    import matplotlib as mpl
    from matplotlib import pyplot as plt
    #import wordcloud
    import numpy as np
    from PIL import Image
    
    # Per-drug table; the code below reads its '类别' and '评论数' columns.
    data = pandas.read_csv('new_comment_num_grade.csv',encoding='utf-8')
    
    # Number of rows per category.
    drug_type_num = data.类别.value_counts()
    
    # Category names, in value_counts order.
    drug_type_names = data.类别.value_counts().index.values
    
    # category name -> total review count, filled by parse().
    drug_type_dict = {}
    
    
    def parse(drug_type_name):
        """Store the summed review count of one drug category in ``drug_type_dict``.

        :param drug_type_name: value of the '类别' column to aggregate
        """
        rows_of_type = data[data['类别'] == drug_type_name]
        total_reviews = rows_of_type['评论数'].sum()
        drug_type_dict[drug_type_name] = int(total_reviews)
    
    
    def plot_wordcloud(drug_dict=None):
        """Render the review counts as an HTML word cloud (drug_comment_wordcloud.html).

        :param drug_dict: mapping of word -> count; the module-level
                          ``drug_type_dict`` is used when it is empty or None

        NOTE(review): ``WordCloud`` is only imported in a commented-out line at the
        top of this script — confirm it is in scope before calling.
        """
        source = drug_dict if drug_dict else drug_type_dict
        words, counts = source.keys(), source.values()
        chart = WordCloud(width=800, height=620)
        chart.add('', words, counts, word_size_range=[20, 100])
        chart.render('drug_comment_wordcloud.html')
    
    
    def plot_wc(drug_dict=None):
        """Draw a mask-shaped word cloud of review counts, show it and save comment_num.jpg.

        :param drug_dict: mapping of word -> frequency; the module-level
                          ``drug_type_dict`` is used when it is empty or None

        NOTE(review): ``wc`` and ``wordcloud`` are only imported in commented-out
        lines at the top of this script — confirm they are in scope before calling.
        """
        mask = np.array(Image.open('mask1.jpg'))
        cloud = wc(
            font_path='font/simsun.ttc',  # font file
            mask=mask,  # background image that shapes the cloud
            max_words=200,  # maximum number of words shown
            max_font_size=100  # largest font size
        )
        frequencies = drug_dict if drug_dict else drug_type_dict
        cloud = cloud.generate_from_frequencies(frequencies)
        # Colour the words with the colours of the mask image.
        cloud.recolor(color_func=wordcloud.ImageColorGenerator(mask))
        plt.imshow(cloud)
        plt.axis('off')
        plt.show()
        cloud.to_file('comment_num.jpg')
    
    
    def plot_series_pie():
        """Plot, via pandas, a pie chart of review totals for categories with at least one review."""
        mpl.rcParams['font.sans-serif'] = ['SimHei']
        pie_data = pandas.read_csv('drug_type_num_sum.csv',encoding='utf-8')
        reviewed = pie_data[pie_data['评论总数'] > 0]
        totals = np.array(reviewed['评论总数'])
        series = pandas.Series(totals, index=reviewed['类别名称'], name='药物类型评论数饼状图')
        series.plot.pie(figsize=(8,8),autopct='%.2f')
    
    
    def plot_mpl_pie():
        """Plot, show and save a matplotlib pie chart of review totals per drug category.

        Only categories with at least one review are drawn.
        """
        font = {
            'family': 'SimHei'
        }
        mpl.rc('font', **font)
        pie_data = pandas.read_csv('drug_type_num_sum.csv', encoding='utf-8')
        # Filter values and labels with the same mask. Previously only the values
        # were filtered, so any category with 0 reviews made the two lengths differ.
        reviewed = pie_data[pie_data.评论总数 > 0]
        numbers = np.array(reviewed.评论总数)
        drug_type = reviewed.类别名称
        plt.pie(numbers, labels=drug_type, autopct='%.2f%%',
                shadow=True, labeldistance=1.1, startangle=90, pctdistance=0.6)
        plt.title('药物类型评论数饼状图')
        plt.savefig('药物类别与评论数量饼状图(mpl).png')
        plt.show()
    
    
    def type_drug_num_pie():
        """Plot, show and save a pie chart of the number of drugs per category."""
        mpl.rc('font', family='SimHei')
        pie_data = pandas.read_csv('drug_type_num_sum.csv', encoding='utf-8')
        drug_counts = np.array(pie_data['药品数量'])
        category_names = pie_data['类别名称']
        plt.pie(
            drug_counts,
            labels=category_names,
            autopct='%.2f%%',
            shadow=True,
            labeldistance=1.1,
            startangle=90,
            pctdistance=0.6,
        )
        plt.title('药物类型药品数量数饼状图')
        plt.savefig('药物类别与药品数量饼状图(mpl).png')
        plt.show()
    
    
    def wirte_to_file():
        """Dump the module-level ``drug_type_dict`` as JSON into comment_num_dict.py."""
        with open('comment_num_dict.py','w',encoding='utf-8') as f:
            json.dump(drug_type_dict,f)
    
    
    def read_from_file():
        """Load and return the JSON content of comment_num_dict.py."""
        with open('comment_num_dict.py', mode='r', encoding='utf-8') as json_file:
            return json.load(json_file)
    
    
    def write_type_num_to_file():
        """Write drug_type_num_sum.csv: category name, number of drugs, total reviews.

        Review totals come from comment_num_dict.py; drug counts are recomputed
        from the '类别' column of ``data``.
        """
        drug_type_dict = read_from_file()
        type_name = list(drug_type_dict.keys())
        type_num = list(drug_type_dict.values())
        # Look each count up by category name. The previous positional pairing
        # relied on the saved dict and value_counts() having identical order and
        # length, which breaks as soon as the saved file is stale.
        counts = data.类别.value_counts()
        drug_type_nums = [int(counts.get(name, 0)) for name in type_name]
        df_data = {'类别名称':type_name,'药品数量':drug_type_nums,'评论总数':type_num,}
        df = pandas.DataFrame(df_data)
        df.to_csv('drug_type_num_sum.csv',mode='w',encoding='utf_8_sig',index=False)
    
    
    def write_new_file():
        """Add share columns (rounded to 4 decimals) to drug_type_num_sum.csv, in place."""
        summary = pandas.read_csv('drug_type_num_sum.csv', encoding='utf-8')
        drug_share = summary['药品数量'] / summary['药品数量'].sum()
        summary['药品数量所占比例'] = drug_share.round(4)
        review_share = summary['评论总数'] / summary['评论总数'].sum()
        summary['评论数量所占比例'] = review_share.round(4)
        summary.to_csv('drug_type_num_sum.csv', mode='w', encoding='utf_8_sig', index=False)
    
    
    def main():
        """Draw the review-count word cloud, computing and caching the counts on first use."""
        if not os.path.exists('comment_num_dict.py'):
            # No cached totals yet: aggregate every category, save them, then plot.
            for type_name in drug_type_names:
                parse(type_name)
            wirte_to_file()
            plot_wc()
            return
        plot_wc(read_from_file())
    
    
    if __name__ == '__main__':
        # Every step is currently disabled; uncomment the ones to run.
        # 1. Compute the share of reviews per category and build the word cloud
        # main()
        # write_type_num_to_file()
        # 2. Draw the pie charts
        # plot_series_pie()
        # plot_mpl_pie()
        # type_drug_num_pie()
        # write_new_file()
        pass  # an `if` body made only of comments is a syntax error
    

      8.前50药品数据合并

    # -*- coding: utf-8 -*-
    """
    Created on Mon Dec  3 20:50:12 2018
    
    @author: Zhang Yafei
    """
    
    import pandas as pd
    import matplotlib.pyplot as plt
    import matplotlib
    
    # Review-count tables, one per team member.
    data1 = pd.read_csv('comment_num_grade_zhangyafei.csv',encoding='utf-8')
    data2 = pd.read_csv('comment_num_grade_wangyuxin.csv',encoding='utf-8')
    data3 = pd.read_csv('comment_num_grade_liangwenqi.csv',encoding='utf-8')
    data4 = pd.read_csv('comment_num_grade_zhangxinrui.csv',encoding='utf-8')
    data5 = pd.read_table('macaizhen.txt',encoding='utf-8',header=None,names=['药品名称','评论数'])
    data6 = pd.read_csv('comment_num_grade_wangshuai.csv',encoding='utf-8')
    data7 = pd.read_csv('comment_num_grade_wangqi.csv',encoding='utf-8')
    # '\t' is spelled as an escape rather than an invisible literal TAB.
    data8 = pd.read_csv('tangao.txt',encoding='utf-8',delimiter='\t',header=None,names=['药品名称','评论数'])

    # Tag every row with the person who collected it.
    data1['who'] = '张亚飞'
    data2['who'] = '王于心'
    data3['who'] = '梁雯琪'
    data4['who'] = '张昕瑞'
    data5['who'] = '马彩珍'
    data6['who'] = '王帅'
    data7['who'] = '王琪'
    data8['who'] = '唐奥'

    # The 50 drugs with the most reviews across all tables.
    data_concat = (
        pd.concat([data1,data2,data3,data4,data5,data6,data7,data8],ignore_index=True,sort=True)
        .sort_values('评论数',ascending=False)
        .reset_index(drop=True)[:50]
    )

    print(data_concat)
    # The next two expressions only display a value when run interactively.
    data_concat.who.value_counts()
    data_concat.评论数.sum()
    # 'sum' replaces np.sum: numpy is never imported in this script, so the old
    # call raised NameError.
    groupby_data = data_concat.groupby(by='who')['评论数'].agg('sum')


    # Extracted review tables, one per team member.
    data9 = pd.read_csv('first50_comment_zhangyafei.csv',encoding='utf-8')
    data10 = pd.read_csv('first50_comment_zhangxinrui.csv',encoding='utf-8')
    data11 = pd.read_csv('first50_comment_wangqi.csv',encoding='utf-8')
    data12 = pd.read_csv('first50_comment_tangao.csv',encoding='utf-8')
    data13 = pd.read_csv('first50_comment_wangshuai.csv',encoding='utf-8')
    data14 = pd.read_csv('first50_comment_wangyuxin.csv',encoding='utf-8')
    data15 = pd.read_csv('first50_comment_liangwenqi.csv',encoding='utf-8')
    data16 = pd.read_csv('first50_comment_macaizhen.csv',encoding='utf-8')

    data_concat2 = pd.concat([data9,data10,data11,data12,data13,data14,data15,data16],ignore_index=True)
    
    def plot_hist():
        """Plot, save and show a histogram of the number of reviews per drug.

        The values are the per-drug row counts of ``data_concat2`` (one row per
        review), grouped into bins roughly 10 reviews wide.
        """
        font = {'family' : 'SimHei'}
        matplotlib.rc('font', **font)
        plt.figure(figsize=(15,8),dpi=80)
        x = data_concat2.药品ID.value_counts().values
        # Number of bins. At least one is required: a value range below 10 used
        # to give 0 bins, which plt.hist rejects.
        num_bins = max(1, int((max(x)-min(x))//10))
        plt.hist(x,num_bins,facecolor='blue')
        plt.xticks(range(int(min(x)),int(max(x))+10,10))
        plt.grid(alpha=0.5)
        plt.title('评论总数前50名药品数量分布状况')
        plt.xlabel('评论数量')
        plt.ylabel('分布情况')
        plt.savefig('评论总数前50名药品数量分布状况1.png')
        plt.show()
        
        
    def plot_bar():
        """Plot, save and show a bar chart comparing the review totals per person."""
        matplotlib.rc('font', family='SimHei')
        plt.figure(figsize=(11, 6), dpi=80)
        people, totals = groupby_data.index, groupby_data.values
        plt.bar(people, totals)
        plt.xlabel('姓名')
        plt.ylabel('评论数')
        plt.title('评论数量前50名个人所占评论总数对比')
        plt.savefig('评论数量前50名个人所占评论总数对比.png')
        plt.show()
    
    #plot_bar()    
    ##
    #plot_hist()
    
    #    df = pd.DataFrame(np.arange(24).reshape(6,4),columns=['A','B','C','D'])
    ##    df[2] = 1  
    #    df    
    #    df[:1]    
    def label_recognition(df):
        """Select the rows on which the annotators agree.

        :param df: DataFrame with columns '分词', '分词2' and '分词3'
        :return: tuple ``(rows where 分词 == 分词2, rows where all three columns are equal)``
        """
        first_two_agree = df['分词'] == df['分词2']
        first_third_agree = df['分词'] == df['分词3']
        return df[first_two_agree], df[first_two_agree & first_third_agree]
        
        
    if __name__ == '__main__':
        # Earlier steps, kept for reference:
        # data_concat.to_csv('concat_first50_comment.csv',encoding='utf_8_sig',index=False)
        # data_concat2.to_csv('first50_comment.csv',encoding='utf_8_sig',index=False)
        # read_excel takes no `encoding` argument in current pandas (workbooks are binary).
        label1 = pd.read_excel(io='first50_comment_zhangxinrui2.xlsx')
        label,label2 = label_recognition(label1)
        # The context manager saves and closes the workbook; ExcelWriter.save()
        # was removed in pandas 2.0.
        with pd.ExcelWriter('three_people_same_label.xlsx') as writer:
            label2.to_excel(writer,sheet_name='diabetes')
        new_label = label.drop('分词2',axis=1)
        new_label.to_csv('label.csv',encoding='utf_8_sig',index=False)
        
    

      9.适应症和不良反应数据字典的构建

    # -*- coding: utf-8 -*-
    
    """
    @Datetime: 2018/1/10
    @Author: Zhang Yafei
    """
    import numpy
    import re
    from scrapy.selector import Selector
    import pandas
    from twisted.web.client import getPage, defer
    from twisted.internet import reactor
    
    # Show every column when a DataFrame is printed.
    pandas.set_option('display.max_columns', None)
    
    # One dict per parsed page, appended by parse().
    data_list = []
    
    # Number of pages handled so far, incremented by parse().
    n = 0
    
    # Empty frame with the output columns; not referenced by the visible code below.
    re_data = pandas.DataFrame(columns=['通用名称', '商品名称', '适应症', '不良反应', 'url'])
    
    
    def parse(content, url):
        """
        Extract names, indications and adverse reactions from a drug manual page.

        One dict with the keys 通用名称, 商品名称, 适应症, 不良反应 and url is appended
        to the module-level ``data_list``. Fields missing from the page are NaN.
        A missing name no longer raises, because an exception in this callback
        would silently drop the whole page.

        :param content: raw response body (bytes, decoded as GBK)
        :param url: URL the page was downloaded from
        :return: None
        """
        global n
        n += 1
        print(n, url)
        text = content.decode('gbk')
        selector = Selector(text=text)
        # NOTE: page structure differs between drugs, hence the fallbacks below.
        drug_name = selector.xpath('//dt[text()="【药品名称】"]/following::*[1]').extract_first()
        if not drug_name:
            drug_name = selector.xpath('//dt[text()="【产品名称】"]/following::*[1]').extract_first()

        def first_capture(pattern):
            """Return the first capture of ``pattern`` in the name block, or NaN."""
            found = re.findall(pattern, drug_name or '')
            return found[0] if found else numpy.nan

        generic_name = first_capture('通用名称:(.*)<br>')
        trade_name = first_capture('商品名称:(.*)<br>')

        function = selector.xpath('//dt[text()="【功能主治】"]/following::*[1]').extract_first()
        if function:
            function = re.sub('<.*?>', '', function).strip()
        else:
            function = numpy.nan

        # The line-break literals below had been split across physical lines
        # (syntax errors). CR and LF are both removed.
        indiction_node = selector.xpath('//dt[text()="【适应症】"]/following::*[1]')
        if indiction_node:
            indiction = indiction_node.xpath('string(.)').extract_first().strip()
            indiction = indiction.replace('\r', '').replace('\n', '')
        else:
            indiction = None
        # Fall back to the "功能主治" text when the page has no "适应症" section.
        indictions = indiction if indiction is not None else function

        adverse_xpath = '//dt[text()="【不良反应】"]/following::*[1]'
        adverse_reaction = selector.xpath(adverse_xpath + '/p/text()').extract_first()
        if adverse_reaction is not None:
            adverse_reaction = adverse_reaction.strip('。')
        else:
            # Some pages hold the text directly in the element instead of a <p>.
            adverse_reaction = selector.xpath(adverse_xpath + '/text()').extract_first()
            if adverse_reaction is None:
                adverse_reaction = numpy.nan
            else:
                adverse_reaction = re.sub('<.*?>', '', adverse_reaction.strip('。')).strip()
                adverse_reaction = adverse_reaction.replace('\r', '').replace('\n', '')

        data = {'通用名称': generic_name, '商品名称': trade_name,
                '适应症': indictions, '不良反应': adverse_reaction,
                'url': url,
                }

        data_list.append(data)
    
    
    def stop_loop(arg):
        """Stop the Twisted reactor; ``arg`` (the value handed over by the Deferred) is ignored."""
        reactor.stop()
    
    
    def main(url_list):
        """
        Download every URL through Twisted's event loop and pass each response to ``parse``.

        Runs the reactor until all requests have completed (successfully or not).

        :param url_list: iterable of URL strings; nothing is done when it is empty
        :return: None
        """
        url_list = list(url_list)
        if not url_list:
            # An empty DeferredList fires immediately, i.e. before reactor.run().
            # stop_loop would then fail and the reactor would run forever.
            return

        # Schedule the requests and attach the parsing callback to each one.
        deferreds = []
        for url in url_list:
            pending = getPage(url.encode('utf-8'))
            pending.addCallback(parse, url=url)
            deferreds.append(pending)

        # Stop the reactor once every request has finished, whatever the outcome.
        defer.DeferredList(deferreds).addBoth(stop_loop)

        # Start processing.
        reactor.run()
    
    
    if __name__ == '__main__':
        # 1. Read the drug IDs and download the manual pages
        # data = pandas.read_excel('three_people_same_label.xlsx')
        # url_list = ['http://ypk.39.net/{}/manual'.format(i) for i in data.药品ID.unique().tolist()]
        # data = pandas.read_excel('drug_dict.xlsx')
        # has_url = set(data.url.tolist())
        # urls = list(set(url_list) - has_url)
        # main(urls)
        #
        # # 2. Write the downloaded information to a file
        # df = pandas.DataFrame(data=data_list)
        # df = df.loc[:, ['通用名称','商品名称','适应症','不良反应','url']]
        # result = pandas.concat([data, df])
        # writer = pandas.ExcelWriter('drug_dict.xlsx')
        # result.to_excel(writer, 'drug_dict', index=False)
        # writer.save()

        # 3. Merge the 39.net drug data with the adverse-reaction database
        # df1 = pandas.read_excel('adverse_reaction_database.xlsx')
        # df2 = pandas.read_excel('drug_dict.xlsx')
        # df2['适应症2'] = numpy.NAN
        # df2['不良反应2'] = numpy.NAN
        # print(df1.药品通用名称)
        # print(df2.通用名称)
        # index = df2.通用名称.apply(lambda x: x in df1.药品通用名称.values)
        # df3 = df2.loc[index, :]
        # df4 = pandas.DataFrame(columns=['药品通用名称', '适应症', '不良反应'])
        # #    df3.通用名称.apply(judge)
        # for k in df3.通用名称.values:
        #     data = df1[df1.药品通用名称 == k]
        #     df4 = df4.append(data, ignore_index=True)
        # writer = pandas.ExcelWriter('drug_dict2.xlsx')
        # df4.to_excel(writer, 'drug_dict', index=False)
        # writer.save()

        # 4. Read drug_dict2.xlsx and merge the rows that share a generic name
        df4 = pandas.read_excel('drug_dict2.xlsx')
        columns = ['药品通用名称', '适应症', '不良反应']
        drug_list = []
        for name in df4.药品通用名称.unique():
            rows = df4[df4.药品通用名称 == name]
            # pandas.notna replaces the identity test `is not numpy.NAN`, which
            # only recognises the very same NaN object and otherwise lets 'nan'
            # leak into the joined text.
            indiction = '/'.join(str(s) for s in rows.适应症.values if pandas.notna(s)).strip()
            adverse = '/'.join(str(s) for s in rows.不良反应.values if pandas.notna(s)).strip()
            drug_list.append({
                '药品通用名称': name,
                '适应症': indiction,
                '不良反应': adverse,
            })
        df5 = pandas.DataFrame(data=drug_list, columns=columns)
        # The context manager saves and closes the workbook; ExcelWriter.save()
        # was removed in pandas 2.0.
        with pandas.ExcelWriter('database_dict.xlsx') as writer:
            df5.to_excel(writer, sheet_name='database_dict', index=False)
    

      

  • 相关阅读:
    初学OptaPlanner-01- 什么是OptaPlanner?
    初学推荐系统-05-Wide&Deep [附tensorflow的WideDeepModel代码简单实践]
    初学推荐系统-04-FM (因子分解机:多特征的二阶特征交叉)
    初学推荐系统-03- 隐语义模型与矩阵分解
    初学推荐系统-02-协同过滤 (UserCF & ItermCF) -附简单示例和优缺点分析
    [Datawhale 10月] 初学推荐系统-01-概述
    TiDB-BR数据备份和恢复工具
    Oracle-估算运行时间长的耗时操作语句
    Hadoop、Spark——完全分布式HA集群搭建
    Hadoop——集群参数配置详解
  • 原文地址:https://www.cnblogs.com/zhangyafei/p/10266329.html
Copyright © 2011-2022 走看看