zoukankan      html  css  js  c++  java
  • 简单爬取微医网

    一.利用request和xpath爬取微医网

      

    #!/usr/bin/env python
    # -*- coding: utf-8 -*-
    #author tom
    import requests
    from lxml import etree
    import pymongo
    
    #爬取微医网类
    class DoctorSpider():
        #初始化应该具有的一些属性
        def __init__(self):
            self.headers={'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.109 Safari/537.36'}
            self.base_url='https://www.guahao.com/expert/all/%E5%85%A8%E5%9B%BD/all/%E4%B8%8D%E9%99%90/p'
            self.page_num=1
            self.info_list = []
            self.client=pymongo.MongoClient(host='127.0.0.1',port=27017)
            self.db=self.client['test']
            #抓取网页数据
        def crwal(self):
            print('正在爬取第{}页'.format(self.page_num))
            url=self.base_url+str(self.page_num)
            res=requests.get(url=url,headers=self.headers).text
            #中国有38页
            if self.page_num<=38:
                self.page_num+=1
            return res
        # 网页内容解析
        def parse(self,res):
            page_text=res
            tree=etree.HTML(page_text)
            li_list=tree.xpath('//div[@class="g-doctor-items to-margin"]/ul/li')
            for li in li_list:
                name=li.xpath("./div[2]/a/text()")[0]
                skill=li.xpath("./div[2]/div[1]/p/text()")[0]
                #解析出来的有很多空格,回车,制表符,
                skill=skill.replace('
    ','').replace('
    ','').replace(' ','').strip()
                position=li.xpath("./div[1]/dl/dt/text()")[1]
                position = position.replace('
    ','').strip()
                score=li.xpath("./div[1]/dl/dd/p[3]/span/em/text()")[0]
                num=li.xpath("./div[1]/dl/dd/p[3]/span/i/text()")[0]
                office=li.xpath("./div[1]/dl/dd/p[1]/text()")[0]
                hospital=li.xpath("./div[1]/dl/dd/p[2]/span/text()")[0]
                dic={
                    'name':name,
                    'skill':skill,
                    'position':position,
                    'score':score,
                    'num':num,
                    'office':office,
                    'hospital':hospital,
                }
                self.save(dic)
    
        #保存函数(保存到mongodb)
        def save(self,dic):
            collection=self.db['weiyiwang']
            collection.save(dic)
    
        #项目启动函数
        def run(self):
            while self.page_num<=38:
                response=self.crwal()
                a=self.parse(response)
    
    if __name__ == '__main__':
        doctor=DoctorSpider()
        doctor.run()
  • 相关阅读:
    SQL Server 2005中的分区表(六):将已分区表转换成普通表
    关于SQL Server中分区表的文件与文件组的删除(转)
    MySQL修改root密码的几种方法
    Aptana 插件 for Eclipse 4.4
    IT励志与指导文章合集(链接)
    正则表达式(转)
    《疯狂原始人》温馨而搞笑片段截图
    指针函数与函数指针的区别(转)
    Linux内核@系统组成与内核配置编译
    2015年我国IT行业发展趋势分析(转)
  • 原文地址:https://www.cnblogs.com/tjp40922/p/10574746.html
Copyright © 2011-2022 走看看