zoukankan      html  css  js  c++  java
  • Python爬虫实战(二):爬百度贴吧

    代码:

    # _*_ coding:utf-8 _*_
    import urllib
    import urllib2
    import re
    class Tool(object):
        """Turn the HTML fragment of one forum post into plain text.

        Each class attribute is a compiled pattern for one kind of markup;
        ``replace`` applies them in a fixed order.
        """

        # <img ...> tags and each run of seven consecutive spaces are deleted.
        # (A trailing empty alternative would only add no-op empty matches,
        # so the pattern is written without one.)
        removingImg = re.compile(r'<img.*?>| {7}')
        # Opening and closing anchor tags are deleted; the link text is kept.
        removingAddr = re.compile(r'<a.*?>|</a>')
        # Tags that become a line break.
        replaceLine = re.compile(r'<tr>|<div>|</div>|</p>')
        # A table cell becomes a tab.
        replaceTD = re.compile(r'<td>')
        # An opening paragraph tag (with or without attributes) becomes a
        # line break.
        replacePara = re.compile(r'<p.*?>')
        # One or two consecutive <br> become a single line break; the double
        # form is listed first so it is consumed as one match.
        replaceBR = re.compile(r'<br><br>|<br>')
        # Whatever tags are left are deleted.
        removeExtraTag = re.compile(r'<.*?>')

        def replace(self, x):
            """Return ``x`` with its HTML markup removed or converted.

            Args:
                x: HTML text of a post.

            Returns:
                The text with images and tags removed, block-level tags and
                ``<br>`` turned into ``"\\n"``, ``<td>`` turned into ``"\\t"``,
                and leading/trailing whitespace stripped.
            """
            x = self.removingImg.sub("", x)
            x = self.removingAddr.sub("", x)
            x = self.replaceLine.sub("\n", x)
            x = self.replaceTD.sub("\t", x)
            x = self.replacePara.sub("\n", x)
            x = self.replaceBR.sub("\n", x)
            # Must run last: it would otherwise swallow the tags handled above.
            x = self.removeExtraTag.sub("", x)
            return x.strip()
            
    class BDTB(object):
        """Download a Baidu Tieba thread page by page into a text file.

        Args:
            baseUrl: URL of the thread, without a query string.
            seeLZ: value sent as the ``see_lz`` query parameter.
            floorTag: the string ``'1'`` makes ``writeData`` put a numbered
                separator line before every post; any other value disables it.
        """

        # Patterns are compiled once; re.S lets '.' span line breaks in the page.
        _titlePattern = re.compile('<h3 class="core_title_txt.*?>(.*?)</h3>', re.S)
        _pageNumPattern = re.compile(
            '<li class="l_reply_num.*?</span>.*?<span.*?>(.*?)</span>', re.S)
        _contentPattern = re.compile('<div id="post_content_.*?>(.*?)</div>', re.S)

        def __init__(self, baseUrl, seeLZ, floorTag):
            self.baseURL = baseUrl
            self.seeLZ = '?see_lz=' + str(seeLZ)
            self.tool = Tool()
            # Output file; opened by setFileTitle and closed by start.
            self.file = None
            # Number written in the next floor separator.
            self.floor = 1
            # File name (without extension) used when no title is available.
            self.defaultTitle = '百度贴吧'
            self.floorTag = floorTag

        def getPage(self, pageNum):
            """Fetch page ``pageNum`` of the thread.

            Returns:
                The response body decoded as UTF-8, or None when the request
                raises ``urllib2.URLError``.
            """
            url = self.baseURL + self.seeLZ + '&pn=' + str(pageNum)
            try:
                response = urllib2.urlopen(urllib2.Request(url))
                return response.read().decode('utf-8')
            except urllib2.URLError as e:
                if hasattr(e, "reason"):
                    # Printed as two single-argument calls so the output is the
                    # same with or without print_function, and the reason is
                    # never implicitly decoded into the unicode message.
                    print(u"连接百度贴吧失败,错误原因")
                    print(e.reason)
                return None

        def getTitle(self, page):
            """Return the stripped thread title found in ``page``, or None."""
            match = self._titlePattern.search(page)
            return match.group(1).strip() if match else None

        def getPageNum(self, page):
            """Return the page count found in ``page`` as a string, or None."""
            match = self._pageNumPattern.search(page)
            return match.group(1).strip() if match else None

        def getContent(self, page):
            """Extract every post body from ``page``.

            Returns:
                A list with one UTF-8 encoded string per post: the cleaned
                text with a line break before and after it.
            """
            return [
                ("\n" + self.tool.replace(item) + "\n").encode('utf-8')
                for item in self._contentPattern.findall(page)
            ]

        def setFileTitle(self, title):
            """Open ``<title>.txt`` for writing, or the default name if title is None."""
            name = title if title is not None else self.defaultTitle
            self.file = open(name + ".txt", "w+")

        def writeData(self, contents):
            """Append ``contents`` to the open file, advancing the floor counter."""
            for item in contents:
                if self.floorTag == '1':
                    floorLine = "\n" + str(self.floor) + u"------------------------------------------------------\n"
                    self.file.write(floorLine)
                self.file.write(item)
                self.floor += 1

        def start(self):
            """Download the whole thread and write it to the output file.

            Nothing is written, and no file is created, when the first page
            cannot be fetched or carries no page count.
            """
            indexPage = self.getPage(1)
            # A failed download yields None, which must not reach the regexes.
            pageNum = self.getPageNum(indexPage) if indexPage is not None else None
            if pageNum is None:
                print("URL已失效,请重试")
                return
            # Only open the output file once the thread is known to be valid.
            self.setFileTitle(self.getTitle(indexPage))
            try:
                print("该帖子共有" + str(pageNum) + "页")
                for i in range(1, int(pageNum) + 1):
                    print("正在写入第" + str(i) + "页数据")
                    # Page 1 is already in hand; do not download it again.
                    page = indexPage if i == 1 else self.getPage(i)
                    if page is None:
                        # getPage has already reported the failure.
                        continue
                    self.writeData(self.getContent(page))
            except IOError as e:
                print("写入异常,原因" + str(e))
            finally:
                self.file.close()
                print("写入任务完成")
    
    
    # Interactive entry point: ask for the thread id and the two options,
    # then download the thread. Answers are stripped so that stray whitespace
    # does not end up in the URL or defeat the '1' comparison for floorTag.
    print(u"请输入帖子代号")
    baseURL = "http://tieba.baidu.com/p/" + raw_input(u'http://tieba.baidu.com/p/').strip()
    seeLZ = raw_input("是否只获取楼主发言,是输入1,否输入0\n").strip()
    floorTag = raw_input("是否写入楼层信息,是输入1,否输入0\n").strip()
    bdtb = BDTB(baseURL,seeLZ,floorTag)
    bdtb.start()
  • 相关阅读:
    我的云之旅hadoop集群(3)
    动态域名绑定
    我的云之旅hadoop集群集成Hive(4)
    Axis2实践
    PHP Apache Mysql搭建
    JavaEE程序员必读图书大推
    我的云之旅hadoop集群集成Hbase集群(5)
    本博客总排名进入前100
    关系数据库及NoSql图书大推荐
    Last_IO_Errno: 1032
  • 原文地址:https://www.cnblogs.com/AndyJee/p/5001283.html
Copyright © 2011-2022 走看看