zoukankan      html  css  js  c++  java
  • Python爬虫实战(一):爬糗事百科段子

    代码:

    # _*_ coding:utf-8 _*_
    import urllib2
    import re
    from datetime import datetime
    
    class QSBK:
        """Interactive console reader for the "hot" listing of qiushibaike.com.

        Pages are downloaded one at a time, parsed into stories with a regular
        expression and kept in a small look-ahead buffer.  Each line the user
        enters shows the next story; entering ``Q`` ends the session.

        Attributes:
            pageIndex: index of the next listing page to download.
            user_agent: User-Agent string sent with every request.
            headers: HTTP headers passed to ``urllib2.Request``.
            stories: buffer of already parsed pages (a list of story lists).
            enable: True while the interactive session is running.
        """

        # Captures, per story: (0) whatever follows the avatar <img> inside its
        # link, (1) author name, (2) story HTML, (3) text of the HTML comment
        # that follows the story (getOneStory reads it as a Unix timestamp),
        # (4) vote count.
        _STORY_PATTERN = re.compile('<div.*?author clearfix">.*?<a.*?<img.*?>(.*?)</a>.*?<a.*?<h2>(.*?)</h2>.*?</a>.*?<div.*?'+'content">(.*?)<!--(.*?)-->.*?</div>.*?<div class="stats.*?class="number">(.*?)</i>',re.S)
        _BR_PATTERN = re.compile('<br/>')

        def __init__(self):
            """Set up request headers and an empty, disabled story buffer."""
            self.pageIndex = 1
            self.user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
            self.headers = {'User-Agent':self.user_agent}
            self.stories = []
            self.enable = False

        def getPage(self,pageIndex):
            """Download one listing page.

            Args:
                pageIndex: number appended to the listing URL.

            Returns:
                The page body decoded as UTF-8, or None when the request fails
                with ``urllib2.URLError`` (the error is printed).
            """
            # NOTE(review): there is no '/' between 'page' and the index --
            # confirm this is the URL form the site expects.
            url = 'http://www.qiushibaike.com/hot/page'+str(pageIndex)
            try:
                request = urllib2.Request(url,headers = self.headers)
                response = urllib2.urlopen(request)
                try:
                    return response.read().decode('utf-8')
                finally:
                    # Release the connection even if reading or decoding fails.
                    response.close()
            except urllib2.URLError as e:
                # Always report the failure; fall back to the exception itself
                # when it carries no ``reason`` attribute.
                print("%s %s" % ("QSBK connect Error,reason: ",getattr(e,'reason',e)))
                return None

        def getPageItems(self,pageIndex):
            """Download a page and extract its stories.

            Entries whose first capture group contains ``img`` are skipped.

            Args:
                pageIndex: listing page to fetch.

            Returns:
                A list of ``[author, text, comment_text, votes]`` lists with
                ``<br/>`` in the text turned into newlines, or None when the
                page could not be loaded.
            """
            pageCode = self.getPage(pageIndex)
            if not pageCode:
                print("Page Loading Error...")
                return None
            pageStories = []
            for item in self._STORY_PATTERN.findall(pageCode):
                if 'img' in item[0]:
                    continue
                text = self._BR_PATTERN.sub("\n",item[2])
                pageStories.append([item[1].strip(),text.strip(),item[3].strip(),item[4].strip()])
            return pageStories

        def loadPage(self):
            """Fetch the next page into the buffer while fewer than two are queued.

            Does nothing when the session is not enabled.  ``pageIndex`` only
            advances when the fetched page yielded at least one story.
            """
            if self.enable and len(self.stories) < 2:
                pageStories = self.getPageItems(self.pageIndex)
                if pageStories:
                    self.stories.append(pageStories)
                    self.pageIndex += 1

        def getOneStory(self,pageStories,page):
            """Show the stories of one page, one per line of user input.

            Args:
                pageStories: stories of the page, as built by getPageItems.
                page: page number shown in front of every story.

            Entering ``Q`` disables the session and returns immediately.
            """
            for story in pageStories:
                command = raw_input()
                # Top up the look-ahead buffer while the user is reading.
                self.loadPage()
                if command == 'Q':
                    self.enable = False
                    return
                print(u"第%d页\t发布人:%s\t发布时间:%s\t赞:%s\n%s" %(page,story[0],datetime.fromtimestamp(int(story[2])),story[3],story[1]))

        def start(self):
            """Run the interactive session until the user quits or loading fails."""
            print(u"正在读取糗事百科,按回车查看新段子,Q退出")
            self.enable = True
            nowPage = 0
            while self.enable:
                if not self.stories:
                    self.loadPage()
                    if not self.stories:
                        # Nothing buffered and nothing could be fetched: stop
                        # instead of spinning in this loop forever.
                        print("No more stories could be loaded, exiting.")
                        self.enable = False
                        break
                pageStories = self.stories.pop(0)
                nowPage += 1
                self.getOneStory(pageStories,nowPage)
    
    # Script entry point: create the spider and run its interactive loop.
    # start() returns only once the session has been disabled.
    spider = QSBK()
    spider.start()
    

      

  • 相关阅读:
    grunt in webstorm
    10+ Best Responsive HTML5 AngularJS Templates
    响应式布局
    responsive grid
    responsive layout
    js event bubble and capturing
    Understanding Service Types
    To add private variable to this Javascript literal object
    Centering HTML elements larger than their parents
    java5 新特性
  • 原文地址:https://www.cnblogs.com/AndyJee/p/4997101.html
Copyright © 2011-2022 走看看