zoukankan      html  css  js  c++  java
  • 简单爬取微医网

    一.利用requests和xpath爬取微医网

      

    #!/usr/bin/env python
    # -*- coding: utf-8 -*-
    #author tom
    import requests
    from lxml import etree
    import pymongo
    
    #爬取微医网类
    class DoctorSpider():
        """Scrape the guahao.com expert listing and store each doctor in MongoDB.

        Pages are fetched one at a time with ``requests``, parsed with lxml
        XPath queries, and every doctor entry is inserted into the
        ``weiyiwang`` collection of the ``test`` database.
        """

        # Last listing page to crawl (the nationwide listing had 38 pages).
        max_page = 38
        # Seconds to wait for the server before giving up on a request.
        timeout = 10

        def __init__(self):
            """Set up request headers, paging state and the MongoDB handle."""
            self.headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.109 Safari/537.36'}
            # The page number is appended to this URL to address one listing page.
            self.base_url = 'https://www.guahao.com/expert/all/%E5%85%A8%E5%9B%BD/all/%E4%B8%8D%E9%99%90/p'
            self.page_num = 1
            # Not used by the class itself; kept so existing readers of the
            # attribute keep working.
            self.info_list = []
            self.client = pymongo.MongoClient(host='127.0.0.1', port=27017)
            self.db = self.client['test']

        @staticmethod
        def _first_text(nodes, index=0, default=''):
            """Return ``nodes[index]``, or *default* when that position is missing."""
            try:
                return nodes[index]
            except IndexError:
                return default

        @staticmethod
        def _remove_whitespace(text):
            """Return *text* with every whitespace character removed."""
            return ''.join(text.split())

        def crwal(self):
            """Fetch the current listing page and advance the page counter.

            Returns:
                The response body decoded as text.

            Raises:
                requests.exceptions.RequestException: if the request fails or
                    does not complete within ``timeout`` seconds.
            """
            print('正在爬取第{}页'.format(self.page_num))
            url = self.base_url + str(self.page_num)
            # A timeout keeps one stalled connection from hanging the crawl.
            res = requests.get(url=url, headers=self.headers,
                               timeout=self.timeout).text
            if self.page_num <= self.max_page:
                self.page_num += 1
            return res

        def parse(self, res):
            """Extract every doctor entry from one page of HTML and save it.

            A field whose node is absent is stored as an empty string, so one
            incomplete entry does not abort the crawl with ``IndexError``.

            Args:
                res: HTML text of one listing page.
            """
            if not res:
                return
            tree = etree.HTML(res)
            if tree is None:
                return
            li_list = tree.xpath('//div[@class="g-doctor-items to-margin"]/ul/li')
            for li in li_list:
                name = self._first_text(li.xpath("./div[2]/a/text()"))
                # The raw skill text is padded with spaces, line breaks and
                # tabs; drop all of it.
                skill = self._remove_whitespace(
                    self._first_text(li.xpath("./div[2]/div[1]/p/text()")))
                # The position is the second text node of the <dt>; remove
                # embedded line breaks and surrounding blanks.
                position = self._first_text(
                    li.xpath("./div[1]/dl/dt/text()"), 1)
                position = position.replace('\r', '').replace('\n', '').strip()
                score = self._first_text(
                    li.xpath("./div[1]/dl/dd/p[3]/span/em/text()"))
                num = self._first_text(
                    li.xpath("./div[1]/dl/dd/p[3]/span/i/text()"))
                office = self._first_text(
                    li.xpath("./div[1]/dl/dd/p[1]/text()"))
                hospital = self._first_text(
                    li.xpath("./div[1]/dl/dd/p[2]/span/text()"))
                dic = {
                    'name': name,
                    'skill': skill,
                    'position': position,
                    'score': score,
                    'num': num,
                    'office': office,
                    'hospital': hospital,
                }
                self.save(dic)

        def save(self, dic):
            """Insert one doctor record into the ``weiyiwang`` collection.

            Args:
                dic: Mapping of field name to scraped text.
            """
            collection = self.db['weiyiwang']
            # Collection.save() was removed in PyMongo 4; insert_one() is the
            # replacement for inserting a new document.
            collection.insert_one(dic)

        def run(self):
            """Crawl and parse pages until ``max_page`` has been processed."""
            while self.page_num <= self.max_page:
                self.parse(self.crwal())
    
    # Script entry point: build a spider and crawl every listing page.
    if __name__ == '__main__':
        DoctorSpider().run()
  • 相关阅读:
    1094. Car Pooling
    121. Best Time to Buy and Sell Stock
    58. Length of Last Word
    510. Inorder Successor in BST II
    198. House Robber
    57. Insert Interval
    15. 3Sum java solutions
    79. Word Search java solutions
    80. Remove Duplicates from Sorted Array II java solutions
    34. Search for a Range java solutions
  • 原文地址:https://www.cnblogs.com/tjp40922/p/10574746.html
Copyright © 2011-2022 走看看