zoukankan      html  css  js  c++  java
  • 简单爬取微医网

    一.利用requests和xpath爬取微医网

      

    #!/usr/bin/env python
    # -*- coding: utf-8 -*-
    #author tom
    import requests
    from lxml import etree
    import pymongo
    
    #爬取微医网类
    class DoctorSpider():
        """Scrape doctor listings from guahao.com and store them in MongoDB.

        Each listing page is downloaded with ``requests``, the doctor entries
        are extracted with lxml XPath queries, and every entry is written as
        one document to the ``weiyiwang`` collection of the ``test`` database
        on the MongoDB server at 127.0.0.1:27017.
        """

        def __init__(self, max_page=38, timeout=10):
            """Set up request headers, paging state and the MongoDB handle.

            Args:
                max_page: Number of the last listing page to crawl. Defaults
                    to 38, the value the script previously hard-coded.
                timeout: Seconds to wait for each HTTP response before
                    ``requests`` gives up instead of blocking forever.
            """
            self.headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.109 Safari/537.36'}
            self.base_url = 'https://www.guahao.com/expert/all/%E5%85%A8%E5%9B%BD/all/%E4%B8%8D%E9%99%90/p'
            self.page_num = 1
            self.max_page = max_page
            self.timeout = timeout
            # Not used by the methods below; kept so existing callers that
            # read the attribute keep working.
            self.info_list = []
            self.client = pymongo.MongoClient(host='127.0.0.1', port=27017)
            self.db = self.client['test']

        @staticmethod
        def _first(nodes, index=0, default=''):
            """Return ``nodes[index]``, or ``default`` when it does not exist."""
            return nodes[index] if len(nodes) > index else default

        @staticmethod
        def _squash_whitespace(text):
            """Remove every whitespace character (spaces, tabs, newlines)."""
            return ''.join(text.split())

        @staticmethod
        def _strip_newlines(text):
            """Drop line breaks, then trim surrounding whitespace."""
            return text.replace('\r', '').replace('\n', '').strip()

        def crwal(self):
            """Download the current listing page and return its HTML text.

            After the request, ``page_num`` is advanced by one as long as it
            has not yet passed ``max_page``.
            """
            print('正在爬取第{}页'.format(self.page_num))
            url = self.base_url + str(self.page_num)
            response = requests.get(url=url, headers=self.headers,
                                    timeout=self.timeout)
            if self.page_num <= self.max_page:
                self.page_num += 1
            return response.text

        def parse(self, res):
            """Extract every doctor entry from a listing page and save it.

            Args:
                res: HTML text of one listing page, as returned by ``crwal``.

            A field whose XPath query matches nothing is stored as an empty
            string so that one incomplete entry does not abort the crawl.
            """
            if not res or not res.strip():
                # Nothing was downloaded, so there is nothing to parse.
                return
            tree = etree.HTML(res)
            if tree is None:
                return
            first = self._first
            for li in tree.xpath('//div[@class="g-doctor-items to-margin"]/ul/li'):
                # The scraped text is padded with spaces, line breaks and
                # tabs, so the free-text fields are cleaned before saving.
                skill = self._squash_whitespace(
                    first(li.xpath("./div[2]/div[1]/p/text()")))
                # The position is read from the second text node of <dt>.
                position = self._strip_newlines(
                    first(li.xpath("./div[1]/dl/dt/text()"), 1))
                self.save({
                    'name': first(li.xpath("./div[2]/a/text()")),
                    'skill': skill,
                    'position': position,
                    'score': first(li.xpath("./div[1]/dl/dd/p[3]/span/em/text()")),
                    'num': first(li.xpath("./div[1]/dl/dd/p[3]/span/i/text()")),
                    'office': first(li.xpath("./div[1]/dl/dd/p[1]/text()")),
                    'hospital': first(li.xpath("./div[1]/dl/dd/p[2]/span/text()")),
                })

        def save(self, dic):
            """Store one doctor record in the ``weiyiwang`` collection.

            ``Collection.save`` no longer exists in PyMongo 4, so its two
            behaviours are spelled out: a record carrying ``_id`` replaces
            (or creates) the document with that id, any other record is
            inserted as a new document.
            """
            collection = self.db['weiyiwang']
            if '_id' in dic:
                collection.replace_one({'_id': dic['_id']}, dic, upsert=True)
            else:
                collection.insert_one(dic)

        def run(self):
            """Crawl and parse listing pages until ``max_page`` is passed."""
            while self.page_num <= self.max_page:
                self.parse(self.crwal())
    
    # Script entry point: build the spider and crawl every listing page.
    if __name__ == '__main__':
        spider = DoctorSpider()
        spider.run()
  • 相关阅读:
    Asp.Net开发小技巧汇总
    .net缓存
    C# .Net 2.0实例学习:WebBrowser页面与WinForm交互技巧(二)
    字符和数组
    jvm调优原则
    ASP.NET页面刷新的实现方法
    js禁止右键和复制功能
    动态定义SL DataGrid Columns [转]
    DataGrid 内嵌ComboBox动态数据联动
    C#对象序列化(2)
  • 原文地址:https://www.cnblogs.com/tjp40922/p/10574746.html
Copyright © 2011-2022 走看看