zoukankan      html  css  js  c++  java
  • Python爬虫实战(二):爬百度贴吧

    代码:

    # _*_ coding:utf-8 _*_
    import urllib
    import urllib2
    import re
    class Tool:
        """Convert a fragment of Tieba post HTML into plain text.

        The class attributes are pre-compiled patterns; ``replace`` applies
        them in a fixed order and returns the stripped result.
        """

        # <img> tags and runs of exactly seven spaces are dropped.
        removingImg = re.compile(r'<img.*?>| {7}')
        # Opening and closing anchor tags are dropped (the link text is kept).
        removingAddr = re.compile(r'<a.*?>|</a>')
        # Tags that end a row/block/paragraph become a newline.
        replaceLine = re.compile(r'<tr>|<div>|</div>|</p>')
        # A table cell start becomes a tab.
        replaceTD = re.compile(r'<td>')
        # A paragraph start becomes a newline.
        replacePara = re.compile(r'<p.*?>')
        # One or two consecutive <br> tags become a single newline.
        replaceBR = re.compile(r'<br><br>|<br>')
        # Any tag still left after the steps above is dropped.
        removeExtraTag = re.compile(r'<.*?>')

        def replace(self, x):
            """Return ``x`` with HTML tags removed or turned into whitespace.

            :param x: HTML text of one post.
            :returns: the plain text, with leading and trailing whitespace
                stripped.
            """
            # Order matters: the specific tags must be rewritten before the
            # catch-all ``removeExtraTag`` pass deletes whatever remains.
            x = self.removingImg.sub("", x)
            x = self.removingAddr.sub("", x)
            x = self.replaceLine.sub("\n", x)
            x = self.replaceTD.sub("\t", x)
            x = self.replacePara.sub("\n", x)
            x = self.replaceBR.sub("\n", x)
            x = self.removeExtraTag.sub("", x)
            return x.strip()
            
    class BDTB:
        """Download the posts of one Baidu Tieba thread into a text file.

        Written for Python 2: it relies on ``urllib2`` and writes UTF-8
        encoded byte strings to a file opened in text mode.
        """

        # Patterns used to pull the title, page count and post bodies out of
        # a thread page. ``re.S`` lets ``.`` span line breaks in the HTML.
        _TITLE_RE = re.compile('<h3 class="core_title_txt.*?>(.*?)</h3>', re.S)
        _PAGE_NUM_RE = re.compile(
            '<li class="l_reply_num.*?</span>.*?<span.*?>(.*?)</span>', re.S)
        _CONTENT_RE = re.compile('<div id="post_content_.*?>(.*?)</div>', re.S)

        def __init__(self, baseUrl, seeLZ, floorTag):
            """Store the thread address and output options.

            :param baseUrl: thread URL without a query string.
            :param seeLZ: value appended as the ``see_lz`` query parameter.
            :param floorTag: the string ``'1'`` to write a separator line
                with the floor number before each post; anything else
                disables it.
            """
            self.baseURL = baseUrl
            self.seeLZ = '?see_lz=' + str(seeLZ)
            self.tool = Tool()
            self.file = None          # opened by setFileTitle()
            self.floor = 1            # running post counter for writeData()
            self.defaultTitle = '百度贴吧'
            self.floorTag = floorTag

        def getPage(self, pageNum):
            """Fetch one page of the thread.

            :param pageNum: page number sent as the ``pn`` query parameter.
            :returns: the page decoded as UTF-8, or ``None`` when the
                request fails with ``urllib2.URLError``.
            """
            url = self.baseURL + self.seeLZ + '&pn=' + str(pageNum)
            try:
                response = urllib2.urlopen(urllib2.Request(url))
                try:
                    return response.read().decode('utf-8')
                finally:
                    response.close()
            except urllib2.URLError as e:
                if hasattr(e, "reason"):
                    print(u"连接百度贴吧失败,错误原因 %s" % (e.reason,))
                # Always report failure explicitly, with or without a reason.
                return None

        def getTitle(self, page):
            """Return the thread title found in ``page``, or ``None``."""
            result = self._TITLE_RE.search(page)
            if result is None:
                return None
            return result.group(1).strip()

        def getPageNum(self, page):
            """Return the page count found in ``page`` as text, or ``None``."""
            result = self._PAGE_NUM_RE.search(page)
            if result is None:
                return None
            return result.group(1).strip()

        def getContent(self, page):
            """Extract every post body from ``page``.

            :returns: a list with one UTF-8 encoded string per post, each
                cleaned by ``Tool.replace`` and wrapped in newlines.
            """
            contents = []
            for item in self._CONTENT_RE.findall(page):
                content = "\n" + self.tool.replace(item) + "\n"
                contents.append(content.encode('utf-8'))
            return contents

        def setFileTitle(self, title):
            """Open ``<title>.txt`` for writing, using the default title if
            ``title`` is ``None``. The handle is stored in ``self.file``."""
            if title is None:
                title = self.defaultTitle
            self.file = open(title + ".txt", "w+")

        def writeData(self, contents):
            """Write each post in ``contents`` to the open file.

            When ``floorTag`` is ``'1'`` a separator line carrying the
            current floor number precedes each post. The floor counter
            advances once per post either way.
            """
            for item in contents:
                if self.floorTag == '1':
                    floorLine = "\n" + str(self.floor) + u"------------------------------------------------------\n"
                    self.file.write(floorLine)
                self.file.write(item)
                self.floor += 1

        def start(self):
            """Scrape the whole thread and write it to the output file."""
            indexPage = self.getPage(1)
            # A failed download or a page without a page count both mean
            # the thread cannot be scraped; stop before creating any file.
            pageNum = None if indexPage is None else self.getPageNum(indexPage)
            if pageNum is None:
                print("URL已失效,请重试")
                return
            title = self.getTitle(indexPage)
            try:
                self.setFileTitle(title)
                print("该帖子共有" + str(pageNum))
                for i in range(1, int(pageNum) + 1):
                    print("正在写入第" + str(i) + "页数据")
                    page = self.getPage(i)
                    if page is None:
                        # getPage already reported the error; skip the page.
                        continue
                    self.writeData(self.getContent(page))
            except IOError as e:
                print("写入异常,原因" + str(e))
            finally:
                # Release the output file even when a write fails.
                if self.file is not None:
                    self.file.close()
                    self.file = None
                print("写入任务完成")
    
    
    # Interactive entry point: ask for the thread id and the two options,
    # then scrape the thread. Answers are stripped so that stray whitespace
    # around "1"/"0" does not silently change the chosen option.
    print(u"请输入帖子代号")
    baseURL = "http://tieba.baidu.com/p/" + raw_input(u'http://tieba.baidu.com/p/').strip()
    seeLZ = raw_input("是否只获取楼主发言,是输入1,否输入0\n").strip()
    floorTag = raw_input("是否写入楼层信息,是输入1,否输入0\n").strip()
    bdtb = BDTB(baseURL, seeLZ, floorTag)
    bdtb.start()
  • 相关阅读:
    PAT顶级 1015 Letter-moving Game (35分)
    PAT顶级 1008 Airline Routes (35分)(有向图的强连通分量)
    PAT顶级 1025 Keep at Most 100 Characters (35分)
    PAT顶级 1027 Larry and Inversions (35分)(树状数组)
    PAT 顶级 1026 String of Colorful Beads (35分)(尺取法)
    PAT顶级 1009 Triple Inversions (35分)(树状数组)
    Codeforces 1283F DIY Garland
    Codeforces Round #438 A. Bark to Unlock
    Codeforces Round #437 E. Buy Low Sell High
    Codeforces Round #437 C. Ordering Pizza
  • 原文地址:https://www.cnblogs.com/AndyJee/p/5001283.html
Copyright © 2011-2022 走看看