zoukankan      html  css  js  c++  java
  • 糗事百科python爬虫

    # -*- coding: utf-8 -*-
    #coding=utf-8
    
    import urllib
    import urllib2
    import re
    import thread
    import time
    
    class QSBK(object):
        """Interactive console reader for the "hot" listing of qiushibaike.com.

        Pages are downloaded with ``urllib2``, parsed with a regular expression
        into ``[author, content]`` pairs and shown one story per line of user
        input. Entering ``Q`` stops the reader.

        Attributes:
            pageIndex: number of the next listing page to download (starts at 1).
            user_agent / header: User-Agent string and the header dict sent with
                every request.
            store: list of already downloaded pages; each page is a list of
                ``[author, content]`` pairs. ``loadPage`` keeps at most two
                pages buffered.
            enable: True while the reader is running.
        """

        # Captures (author, content) for each story block. The raw string keeps
        # the ``\s*`` escapes intact so whitespace around the story text is
        # trimmed; re.S lets ``.`` span the line breaks inside the markup.
        _STORY_PATTERN = re.compile(
            r'<div class="author.*?<a.*?<img.*?</a>.*?<a.*?<h2>(.*?)</h2>'
            r'.*?class="content.*?<span>\s*(.*?)\s*</span>',
            re.S)

        def __init__(self):
            self.pageIndex = 1
            self.user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
            self.header = {'User-Agent': self.user_agent}
            self.store = []
            self.enable = False

        def getPage(self, pageIndex):
            """Download listing page ``pageIndex``.

            Returns:
                The page body decoded as UTF-8, or None when ``urllib2`` raises
                a ``URLError`` (a message is printed in that case).
            """
            url = 'http://www.qiushibaike.com/hot/page/' + str(pageIndex)
            request = urllib2.Request(url, headers=self.header)
            try:
                response = urllib2.urlopen(request)
                try:
                    return response.read().decode('utf-8')
                finally:
                    # Release the connection even if read/decode fails.
                    response.close()
            except urllib2.URLError as e:
                # ``reason`` is not always a string (it can be an exception
                # object), so convert it explicitly before concatenating.
                reason = getattr(e, 'reason', e)
                print('链接网络失败' + str(reason))
                return None

        def _parseStories(self, page):
            """Extract the stories from the HTML text ``page``.

            Returns:
                A list of ``[author, content]`` lists, in document order; empty
                when nothing in ``page`` matches the story pattern.
            """
            return [[author, content]
                    for author, content in self._STORY_PATTERN.findall(page)]

        def getPageItem(self, pageIndex):
            """Download and parse listing page ``pageIndex``.

            Returns:
                A list of ``[author, content]`` lists, or None when the page
                could not be downloaded.
            """
            page = self.getPage(pageIndex)
            if page is None:
                print("页面获得失败")
                return None
            return self._parseStories(page)

        def loadPage(self):
            """Buffer the next page in ``self.store`` if there is room for it.

            Does nothing when the reader is not enabled or two pages are already
            buffered. ``pageIndex`` only advances after a page with at least one
            story was stored; a failed download or a page without stories is
            not buffered, so the caller can detect that nothing new arrived.
            """
            if not self.enable or len(self.store) >= 2:
                return
            pageStories = self.getPageItem(self.pageIndex)
            if pageStories:
                self.store.append(pageStories)
                self.pageIndex += 1

        def getOneStory(self, pageStories):
            """Print the stories of one page, one per line of user input.

            Before each story a line is read from stdin. ``Q`` (or end of
            input) disables the reader and returns immediately; any other
            input prints the next story. The buffer is topped up via
            ``loadPage`` while the user keeps reading.
            """
            for story in pageStories:
                try:
                    command = raw_input()
                except EOFError:
                    # stdin was closed: treat it like an explicit quit.
                    command = 'Q'
                if command == 'Q':
                    self.enable = False
                    return
                # Prefetch only when the user actually continues reading.
                self.loadPage()
                print(u'%s %s' % (story[0], story[1]))

        def start(self):
            """Run the interactive loop until the user quits or pages run out."""
            print(u"正在读取糗事百科的数据,按Q退出")
            self.enable = True
            while self.enable:
                if not self.store:
                    self.loadPage()
                    if not self.store:
                        # Nothing could be loaded: stop instead of spinning
                        # forever on an empty buffer.
                        print(u"没有读取到新的内容,程序结束")
                        self.enable = False
                        break
                self.getOneStory(self.store.pop(0))
    
    
    
    # Start the interactive reader only when this file is executed as a
    # script, so importing it does not trigger network access or block on
    # console input.
    if __name__ == '__main__':
        spider = QSBK()
        spider.start()
  • 相关阅读:
    图片懒加载技术
    验证码处理
    基于requests模块的cookie,session和线程池爬取
    最快理解
    Golang
    Django REST framework
    Django REST framework
    Django
    Django
    搭建邮件服务器 Postfix + Dovecot (CentOS)
  • 原文地址:https://www.cnblogs.com/norm/p/7425193.html
Copyright © 2011-2022 走看看