[Crawler] Scraping Baidu Tieba posts with BeautifulSoup

    I came across an example of scraping Baidu Tieba threads online and wrote my own version with BeautifulSoup. The script pages through a thread via the ?see_lz= (author-only) and &pn= (page number) query parameters and writes each floor's text to a .txt file. Here is the code.

    #coding:gbk
    import urllib2
    from bs4 import BeautifulSoup
    import re
    import os  
    
    class TiebatoTxt:
        def __init__(self, url, seeLZ):
            # Base URL of the thread
            self.url = url
            # Whether to show only the original poster's replies (see_lz=1)
            self.seeLZ = '?see_lz=' + str(seeLZ)
            self.floor = 1
            self.File = None
            self.defaultTitle = "百度贴吧"
    
        # Fetch one page of the thread and return it as a BeautifulSoup object
        def get_body(self, pageNum):
            url = self.url + self.seeLZ + '&pn=' + str(pageNum)
            req = urllib2.Request(url)
            try:
                html = urllib2.urlopen(req)
            except (urllib2.HTTPError, urllib2.URLError) as e:
                print u"Error fetching the thread page"
                return None
            try:
                bsObj = BeautifulSoup(html, "html.parser")
            except AttributeError as e:
                print u"Error building the BeautifulSoup object"
                return None
            return bsObj
        
        # Extract the thread title from the page <head>
        def find_title(self, page):
            title = page.find("head").find("title")
            if title is not None:
                return title.get_text()
            return None
            
        # Find how many pages the thread has
        def get_pagenum(self, page):
            pageinfoList = page.findAll("li", {"class": "l_reply_num"})
            if pageinfoList:
                for info in pageinfoList:
                    span = info.findAll("span")
                    # The second <span> holds the page count
                    if len(span) > 1:
                        return span[1].get_text().encode("gbk")
            print "pageinfoList is empty"
            return None
                
        # Collect the text of every floor (post) on one page
        def get_content(self, page):
            # Each post body sits in a div whose id starts with "post_content_"
            div = page.findAll("div", {"id": re.compile("post_content_.*?")})
            contents = []
            for item in div:
                # Separator line carrying the floor number
                floorLine = "\n\n" + str(self.floor) + u"------------------------------------------------------\n\n"
                contents.append(floorLine)
                # Ignore characters that cannot be encoded as gbk
                con = item.getText("\n", strip=True).encode("gbk", "ignore")
                self.floor = self.floor + 1
                # Some words carry links; drop the line breaks bs4 inserts around them
                txturl = item.findAll("a")
                if txturl:
                    for i in txturl:
                        word = i.getText(strip=True).encode("gbk", "ignore")
                        con = con.replace("\n%s\n" % word, word)
                contents.append(con)
            return contents
                
        def setFileTitle(self, title):
            # If a title was found, use it as the file name (strip path separators)
            if title is not None:
                title = title.replace('/', '')
                self.File = open(os.path.join(os.getcwd(), title + ".txt"), "w+")
            else:
                self.File = open(os.path.join(os.getcwd(), self.defaultTitle + ".txt"), "w+")
                
        def writetotxt(self, contents):
            # Write every floor's content to the output file
            for item in contents:
                self.File.write(item)
    
        def start(self):
            indexPage = self.get_body(1)
            if indexPage is None:
                return
            pageNum = self.get_pagenum(indexPage)
            title = self.find_title(indexPage)
            self.setFileTitle(title)
            if pageNum is None:
                print "URL is no longer valid, please try again"
                return
            try:
                print "This thread has " + str(pageNum) + " pages"
                for i in range(1, int(pageNum) + 1):
                    print "Writing page " + str(i)
                    page = self.get_body(i)
                    contents = self.get_content(page)
                    self.writetotxt(contents)
            # Catch write failures
            except IOError as e:
                print "Write failed: " + str(e)
            finally:
                if self.File:
                    self.File.close()
                print "Writing finished"
    
    
    # Example thread ID: 270051025
    if __name__ == '__main__':
        print u"Enter the thread ID"
        baseURL = 'http://tieba.baidu.com/p/' + str(raw_input(u'http://tieba.baidu.com/p/'))
        seeLZ = raw_input(u"Only fetch the original poster's posts? Enter 1 for yes, 0 for no: ")
        t = TiebatoTxt(baseURL, seeLZ)
        t.start()
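
    The listing above is Python 2 throughout (urllib2, print statements, raw_input). As a minimal sketch of the same fetch-and-parse step on Python 3, assuming only the standard library plus bs4 and reusing the thread ID from the comment above purely as a placeholder, the core of get_body could look like this:

    import urllib.request
    import urllib.error
    from bs4 import BeautifulSoup

    def get_body(thread_id, see_lz=1, page_num=1):
        # Same URL scheme as the class above: see_lz toggles the
        # author-only view, pn selects the page
        url = 'http://tieba.baidu.com/p/%s?see_lz=%d&pn=%d' % (thread_id, see_lz, page_num)
        try:
            html = urllib.request.urlopen(url)
        except urllib.error.URLError:
            return None
        return BeautifulSoup(html, 'html.parser')

    page = get_body('270051025')
    if page is not None:
        print(page.find('title').get_text())

    The bs4 calls are unchanged between the two versions; only the HTTP layer moves from urllib2 to urllib.request.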
Original article: https://www.cnblogs.com/zoro-robin/p/5788595.html