zoukankan      html  css  js  c++  java
  • Python爬虫实战(一):爬糗事百科段子

    代码:

    # _*_ coding:utf-8 _*_
    import urllib2
    import re
    from datetime import datetime
    
    class QSBK:
        """Interactive console reader for the qiushibaike.com "hot" listing.

        Pages are downloaded with ``urllib2``, parsed with a regular
        expression and buffered in ``self.stories``.  ``start()`` then prints
        one story per line of console input until the user enters ``Q``.

        Attributes:
            pageIndex: number of the next page to download (starts at 1).
            user_agent: value sent in the ``User-Agent`` request header.
            headers: header dict passed to every ``urllib2.Request``.
            stories: buffer of parsed pages; each entry is a list of stories,
                each story being ``[publisher, text, timestamp, likes]``.
            enable: ``True`` while the reader loop should keep running.
        """

        # Compiled once for the whole class instead of on every call.
        # Groups: 0 = markup following the first <img> inside the first <a>,
        # 1 = <h2> text, 2 = content body, 3 = HTML comment inside the
        # content, 4 = text of the "number" element in the stats block.
        _STORY_PATTERN = re.compile(
            '<div.*?author clearfix">.*?<a.*?<img.*?>(.*?)</a>'
            '.*?<a.*?<h2>(.*?)</h2>.*?</a>'
            '.*?<div.*?content">(.*?)<!--(.*?)-->.*?</div>'
            '.*?<div class="stats.*?class="number">(.*?)</i>',
            re.S)
        _BR_PATTERN = re.compile('<br/>')

        def __init__(self):
            self.pageIndex = 1
            self.user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
            self.headers = {'User-Agent': self.user_agent}
            self.stories = []
            self.enable = False

        def getPage(self, pageIndex):
            """Download page ``pageIndex`` and return it decoded as UTF-8.

            Returns:
                The decoded page text, or ``None`` when ``urllib2`` raises
                ``URLError`` (the error is printed to the console).
            """
            try:
                url = 'http://www.qiushibaike.com/hot/page' + str(pageIndex)
                request = urllib2.Request(url, headers=self.headers)
                response = urllib2.urlopen(request)
                try:
                    return response.read().decode('utf-8')
                finally:
                    # Release the connection even if read()/decode() fails.
                    response.close()
            except urllib2.URLError as e:
                # Report every URLError, not only those carrying `reason`.
                reason = getattr(e, 'reason', e)
                print("%s %s" % ("QSBK connect Error,reason: ", reason))
                return None

        def getPageItems(self, pageIndex):
            """Download and parse one page into a list of stories.

            Entries whose first captured group contains ``img`` are skipped.
            ``<br/>`` tags in the story text are turned into newlines.

            Returns:
                A list of ``[publisher, text, timestamp, likes]`` lists
                (possibly empty), or ``None`` if the page could not be loaded.
            """
            pageCode = self.getPage(pageIndex)
            if not pageCode:
                print("Page Loading Error...")
                return None
            pageStories = []
            for item in self._STORY_PATTERN.findall(pageCode):
                if re.search("img", item[0]):
                    continue
                text = self._BR_PATTERN.sub("\n", item[2])
                pageStories.append([item[1].strip(), text.strip(),
                                    item[3].strip(), item[4].strip()])
            return pageStories

        def loadPage(self):
            """Prefetch the next page while fewer than two pages are buffered.

            Does nothing when the reader is disabled or the buffer is full.
            A page that fails to load is retried on the next call; a page that
            loads but yields no stories is skipped so that the reader does not
            request the same page over and over.
            """
            if not self.enable or len(self.stories) >= 2:
                return
            pageStories = self.getPageItems(self.pageIndex)
            if pageStories is None:
                return
            self.pageIndex += 1
            if pageStories:
                self.stories.append(pageStories)

        def _formatTime(self, rawStamp):
            """Convert a Unix-timestamp string to a ``datetime``.

            Falls back to returning ``rawStamp`` unchanged when it is not a
            usable integer timestamp, so that printing a story never crashes.
            """
            try:
                return datetime.fromtimestamp(int(rawStamp))
            except (ValueError, OverflowError, OSError):
                return rawStamp

        def getOneStory(self, pageStories, page):
            """Print the stories of one page, one per line of console input.

            Entering ``Q`` (or closing stdin) clears ``self.enable`` and
            returns immediately.

            Args:
                pageStories: list of ``[publisher, text, timestamp, likes]``.
                page: page number shown in front of each story.
            """
            for story in pageStories:
                try:
                    command = raw_input()
                except EOFError:
                    # No more input can arrive; treat it like a quit request.
                    command = 'Q'
                if command == 'Q':
                    self.enable = False
                    return
                # Prefetch only after we know the user wants to continue.
                self.loadPage()
                print(u"第%d页\t发布人:%s\t发布时间:%s\t赞:%s\n%s" % (
                    page, story[0], self._formatTime(story[2]),
                    story[3], story[1]))

        def start(self):
            """Run the interactive reader until the user quits.

            If the buffer is empty and one refill attempt adds nothing, the
            loop stops instead of spinning forever on an empty buffer.
            """
            print(u"正在读取糗事百科,按回车查看新段子,Q退出")
            self.enable = True
            self.loadPage()
            nowPage = 0
            while self.enable:
                if not self.stories:
                    self.loadPage()
                    if not self.stories:
                        print("No more stories could be loaded, exiting.")
                        self.enable = False
                        break
                pageStories = self.stories.pop(0)
                nowPage += 1
                self.getOneStory(pageStories, nowPage)
    
    # Script entry point: create the reader and run its console loop, which
    # returns once QSBK.enable has been cleared.
    spider = QSBK()
    spider.start()
    

      

  • 相关阅读:
    JavaScript Web页面内容导出到Word、Excel (转载)
    合并多个声音文件
    龙舟记
    c#获取应用程序目录
    ADO.NET数据库连接池研究(一) 查看连接池数 (转)
    UpdatePanel 内控件 更新“外的”控件【转】
    web客户端播放wav文件
    解决DataList控件无缝滚动图片(转)
    关闭sleeping连接进程在Sql Server2000数据库存储过程中(转)
    win7下没有注册类别 (异常来自 HRESULT:0x80040154 (REGDB_E_CLASSNOTREG))
  • 原文地址:https://www.cnblogs.com/AndyJee/p/4997101.html
Copyright © 2011-2022 走看看