zoukankan      html  css  js  c++  java
  • 一个简单的python爬虫,爬取知乎

    一个简单的python爬虫,爬取知乎

    • 主要功能:爬取一个收藏夹里所有问题的答案中的图片
    • 文字信息暂未收录,可自行实现,比抓取图片更简单
    • 具体代码里有详细注释,请自行阅读

    项目源码:

      1 # -*- coding:utf-8 -*-
      2 
      3 from spider import SpiderHTML
      4 from multiprocessing import Pool
      5 import sys,urllib,http,os,random,re,time
      6 __author__ = 'waiting'
      7 '''
      8 使用了第三方的类库 BeautifulSoup4,请自行安装
      9 需要目录下的spider.py文件
     10 运行环境:python3.4,windows7
     11 '''
     12 
     # Address of the Zhihu collection to crawl. The "?page=N" query string is
     # appended by the spider, so it must not be included here.
     url = 'https://www.zhihu.com/collection/30822111'

     # Local directory under which everything is stored. Per the original note it
     # is created automatically when missing (presumably by the SpiderHTML save
     # helpers -- confirm in spider.py).
     # A raw string is used because '\z' and '\会' are not valid escape sequences;
     # the resulting value is unchanged.
     store_path = r'E:\zhihu收藏夹\会员才知道的世界'
     18 
     class zhihuCollectionSpider(SpiderHTML):
         """Crawl a Zhihu collection and save the images found in its answers.

         For each requested page of the collection, every listed question is
         opened and each expanded answer is processed: the author's details are
         written to ``<store_path>/<question>/info/<author>_info.txt`` and the
         answer's images are saved to ``<store_path>/<question>/``.

         Fetching and saving are delegated to ``getUrl``, ``saveText`` and
         ``saveImg``, which are not defined here (presumably inherited from
         ``SpiderHTML``).
         """

         # Characters that Windows does not allow in file or directory names.
         _INVALID_NAME_CHARS = re.compile(r'[\\/:*?"<>|]')

         def __init__(self, pageStart, pageEnd, url):
             """Store the crawl range.

             Args:
                 pageStart: First collection page to crawl (int or numeric str).
                 pageEnd: Last collection page to crawl, inclusive.
                 url: Collection address without the ``?page=`` query.

             Raises:
                 ValueError: If a page bound cannot be converted to ``int``.
             """
             # NOTE(review): SpiderHTML.__init__ is not called here -- confirm
             # the base class needs no initialisation.
             self._url = url
             self._pageStart = int(pageStart)
             self._pageEnd = int(pageEnd) + 1    # +1 so range() includes pageEnd
             self.downLimit = 0                  # answers with fewer upvotes are skipped

         def start(self):
             """Crawl every page in the configured range, question by question.

             Errors while fetching a collection page propagate to the caller;
             errors while fetching a single question are reported and skipped.
             """
             for page in range(self._pageStart, self._pageEnd):
                 pageUrl = self._url + '?page=' + str(page)
                 content = self.getUrl(pageUrl)
                 for question in content.find_all('div', class_='zm-item'):
                     titleTag = question.find('h2', class_='zm-item-title')
                     # Removed (censored) entries have no title or no link.
                     if titleTag is None or titleTag.a is None:
                         continue
                     self._processQuestion(titleTag.a)
                     time.sleep(5)               # throttle between question requests

         def _processQuestion(self, link):
             """Fetch the question behind ``link`` and process its answers."""
             Qurl = 'https://www.zhihu.com' + link['href']
             # ``.string`` is None when the anchor contains nested markup.
             rawTitle = link.string if link.string is not None else link.get_text()
             Qtitle = self._safeName(rawTitle)
             try:
                 print('-----正在获取问题:' + Qtitle + '-----')
             except UnicodeEncodeError:
                 # The console encoding cannot represent the title.
                 print(r'---问题含有特殊字符无法显示---')

             try:
                 Qcontent = self.getUrl(Qurl)
             except Exception:
                 # Skip this question instead of continuing with an unbound
                 # (or stale, previous-question) page.
                 print('!!!!获取出错!!!!!')
                 return

             answerList = Qcontent.find_all('div', class_='zm-item-answer  zm-item-expanded')
             self._processAnswer(answerList, Qtitle)

         def _safeName(self, name):
             """Replace characters that are illegal in Windows names with '#'."""
             return self._INVALID_NAME_CHARS.sub('#', name)

         @staticmethod
         def _parseUpvotes(text):
             """Convert a vote label such as '12', '3K' or '1.2K' to an int.

             Raises:
                 AttributeError: If ``text`` is None.
                 ValueError: If the label is not numeric.
             """
             text = text.strip()
             if text.upper().endswith('K'):
                 return int(round(float(text[:-1]) * 1000))
             return int(text)

         def _readAuthor(self, answer, index):
             """Return the author's ``name``, ``introduction`` and ``link``.

             Anonymous authors are named '匿名用户<index>' and have an empty
             link. A missing bio leaves the introduction empty without
             affecting the other fields.
             """
             author = {'name': '匿名用户' + str(index), 'introduction': '', 'link': ''}
             authorInfo = answer.find('div', class_='zm-item-answer-author-info')
             if authorInfo is None:
                 return author

             nameTag = authorInfo.find('a', class_='author-link')
             if nameTag is not None and nameTag.string:
                 author['name'] = str(nameTag.string)
                 author['link'] = nameTag.get('href', '')

             bioTag = authorInfo.find('span', class_='bio')
             if bioTag is not None:
                 author['introduction'] = str(bioTag.get('title', ''))
             return author

         def _processAnswer(self, answerList, Qtitle):
             """Save author info and images for each answer of one question.

             Args:
                 answerList: Answer elements of the question page.
                 Qtitle: Sanitised question title, used as the directory name.
             """
             for index, answer in enumerate(answerList, start=1):
                 countTag = answer.find('span', class_='count')
                 try:
                     upvoted = self._parseUpvotes(countTag.string)
                 except (AttributeError, ValueError):
                     # No counter or an unparseable label: count it as 0 votes.
                     upvoted = 0
                 if upvoted < self.downLimit:
                     continue

                 author = self._readAuthor(answer, index)
                 file_name = os.path.join(store_path, Qtitle, 'info',
                                          self._safeName(author['name']) + '_info.txt')
                 if os.path.exists(file_name):   # this answer was crawled before
                     continue

                 self.saveText(file_name, '{introduction}\n{link}'.format(**author))
                 print('正在获取用户`{name}`的答案'.format(**author))

                 answerContent = answer.find('div', class_='zm-editable-content clearfix')
                 if answerContent is None:       # reported answers have no content
                     continue

                 imgs = answerContent.find_all('img')
                 if imgs:
                     self._getImgFromAnswer(imgs, Qtitle, **author)

         def _getImgFromAnswer(self, imgs, Qtitle, **author):
             """Download the images of one answer, skipping small inline ones.

             Files are named ``<author>_<n><ext>`` inside the question folder.
             A failure on one image is reported and does not stop the rest.
             """
             safeAuthor = self._safeName(author['name'])
             saved = 0
             for img in imgs:
                 # Images without a class attribute are treated as regular ones.
                 if 'inline-image' in img.get('class', []):
                     continue
                 imgUrl = img.get('src')
                 if not imgUrl:
                     continue
                 saved += 1
                 extension = os.path.splitext(imgUrl)[1]
                 path_name = os.path.join(store_path, Qtitle,
                                          safeAuthor + '_' + str(saved) + extension)
                 try:
                     self.saveImg(imgUrl, path_name)
                 except Exception as err:
                     # Best effort: report and carry on. The message is ASCII
                     # so it prints on any console encoding.
                     print('!!!! failed to save image {}: {!r}'.format(imgUrl, err))

         def _getTextFromAnswer(self):
             """Placeholder for saving answer text; not implemented."""
             pass
    106 
    # Command-line entry point, e.g. ``zhihu.py 1 5`` crawls collection pages
    # 1 through 5. With a single argument only that page is crawled; with no
    # arguments page 1 is crawled.
    if __name__ == '__main__':
        args = sys.argv[1:]
        if len(args) >= 2:
            page, pageEnd = args[0], args[1]
        elif len(args) == 1:
            page = pageEnd = args[0]
        else:
            page = pageEnd = 1

        # Validate here so a bad argument gives a usage hint, not a traceback.
        try:
            page, pageEnd = int(page), int(pageEnd)
        except ValueError:
            sys.exit('usage: zhihu.py [pageStart [pageEnd]]  (page numbers must be integers)')

        spider = zhihuCollectionSpider(page, pageEnd, url)
        spider.start()

    很多初学者,对Python的概念都是模糊不清的,C语言、Python能做什么,学的时候,该按照什么线路去学习,学完往哪方面发展,想深入了解,详情可以点击有道云笔记链接了解:http://note.youdao.com/noteshare?id=e4fa02e7b56d7909a27674cdb3da08aa

  • 相关阅读:
    BT656与BT1120的区别
    Arria10中PHY的时钟线结构
    Arria10中的IOPLL与fPLL
    Nios内部RAM固化配置
    实现1sym转换成2个sym送给CVI(VGA数据)
    embeded_2_separate_sync
    动态规划--青蛙跳
    动态规划 0--1 背包问题
    模拟题
    动态规划--最大子段和
  • 原文地址:https://www.cnblogs.com/ITbiancheng/p/12095464.html
Copyright © 2011-2022 走看看