zoukankan      html  css  js  c++  java
  • Python爬虫爬取百度贴吧的帖子

    同样是参考网上教程,编写了一个爬取百度贴吧帖子内容的爬虫,并把爬取到的帖子保存到本地文本文件:

    #!/usr/bin/python
    #_*_coding:utf-8_*_
    import urllib
    import urllib2
    import re
    import sys

    # Python 2-only workaround: make UTF-8 the interpreter-wide default codec so
    # that the implicit str <-> unicode conversions performed by the code below
    # work on Chinese text instead of failing with the default ASCII codec.
    # reload(sys) is required first because setdefaultencoding is removed from
    # the sys module during interpreter start-up.
    reload(sys)
    sys.setdefaultencoding("utf-8")
    # Cleans up the HTML of a post: strips images, hyperlinks and any other tags.
    class Tool:
        """Regex-based cleaner for the HTML body of a single Tieba post.

        Only ``removeImg``, ``removeAddr`` and ``removeET`` are applied by
        ``replace``; the other patterns are kept as class attributes but are
        not used by it.
        """

        # <img ...> tags and runs of exactly seven spaces.
        # (The trailing empty alternative of the original pattern only matched
        # the empty string and has been dropped; the result is unchanged.)
        removeImg = re.compile(r'<img.*?>| {7}')
        # Opening and closing hyperlink tags.
        removeAddr = re.compile(r'<a.*?>|</a>')
        # Tags that mark a line break (not applied by replace()).
        replaceLine = re.compile(r'<tr>|<div>|</div>|</p>')
        # Table cell opener (not applied by replace()).
        replaceTD = re.compile(r'<td>')
        # Paragraph opener (not applied by replace()).
        replacePara = re.compile(r'<p.*?>')
        # Single or doubled <br> (not applied by replace()).
        replaceBR = re.compile(r'<br><br>|<br>')
        # Any tag that is still left.
        removeET = re.compile(r'<.*?>')

        def replace(self, x):
            """Return ``x`` with images, links and all other tags removed.

            Args:
                x: HTML fragment (text) of one post.

            Returns:
                The cleaned text with surrounding whitespace stripped, encoded
                as a UTF-8 byte string.
            """
            # Order matters: images and links first, then whatever tags remain.
            for pattern in (self.removeImg, self.removeAddr, self.removeET):
                x = pattern.sub("", x)
            return x.strip().encode('utf-8')
    # Baidu Tieba crawler exercise.
    class BDTB:
        """Download every page of one Baidu Tieba thread into a local text file.

        Written for Python 2 (uses ``urllib2``); the syntax is kept compatible
        with Python 2.6+.
        """

        def __init__(self, baseUrl, seeLz, floorTag):
            """Store the crawl settings.

            Args:
                baseUrl: thread URL without a query string.
                seeLz: 1 (or '1') to fetch only the original poster's posts,
                    0 for all posts.
                floorTag: the string '1' to write a separator line before each
                    post; anything else disables separators.
            """
            # Thread URL without query string.
            self.baseURL = baseUrl
            # "Only original poster" filter. NOTE(review): Tieba's query
            # parameter is "see_lz"; the previous "seelz" spelling appears to
            # be ignored by the server, so the filter never took effect.
            self.seeLZ = '?see_lz=' + str(seeLz)
            # HTML tag stripper.
            self.tool = Tool()
            # Output file object; opened by setFileTitle().
            self.file = None
            # Number of the next post ("floor") to be written, starting at 1.
            self.floor = 1
            # File title used when the thread title cannot be extracted.
            self.defaultTitle = u"百度贴吧"
            # Whether to write a separator line before each post.
            self.floorTag = floorTag

        def getPage(self, pageNum):
            """Fetch page ``pageNum`` of the thread.

            Returns:
                The page decoded from UTF-8, or None when the request fails
                with ``urllib2.URLError``.
            """
            url = self.baseURL + self.seeLZ + '&pn=' + str(pageNum)
            try:
                response = urllib2.urlopen(urllib2.Request(url))
                return response.read().decode('utf-8')
            except urllib2.URLError as e:
                # Report the cause when the error carries one.
                if hasattr(e, "reason"):
                    print(u'链接百度贴吧失败,错误原因: %s' % (e.reason,))
                return None

        def getTitle(self, page):
            """Extract the thread title from ``page``.

            When ``page`` is None, page 1 is downloaded instead. Returns the
            stripped title, or None if no page or no title is available.
            """
            if page is None:
                page = self.getPage(1)
            if page is None:
                return None
            pattern = re.compile(r'<h3 class="core_title_txt.*?>(.*?)</h3>', re.S)
            result = pattern.search(page)
            return result.group(1).strip() if result else None

        def getPageNum(self, page):
            """Extract the total number of pages from ``page``.

            When ``page`` is None, page 1 is downloaded instead. Returns the
            page count as text, or None if it cannot be determined.
            """
            if page is None:
                page = self.getPage(1)
            if page is None:
                return None
            pattern = re.compile(
                r'<li class="l_reply_num.*?</span>.*?<span.*?>(.*?)</span>', re.S)
            result = pattern.search(page)
            # Return (not print) None so callers can detect the failure.
            return result.group(1).strip() if result else None

        def getContent(self, page):
            """Return the cleaned text of every post found in ``page``.

            Each entry is a UTF-8 byte string wrapped in newlines so that posts
            end up on separate lines in the output file. A None page yields an
            empty list.
            """
            if page is None:
                return []
            pattern = re.compile(r'<div id="post_content.*?>(.*?)</div>', re.S)
            # tool.replace() already returns UTF-8 bytes, so only the newline
            # padding is added here; encoding a second time is not needed.
            return [b"\n" + self.tool.replace(item) + b"\n"
                    for item in pattern.findall(page)]

        def setFileTitle(self, title):
            """Open ``<title>.txt`` for writing and store it in ``self.file``.

            Falls back to ``self.defaultTitle`` when ``title`` is None or
            empty. Characters that are not allowed in file names are replaced
            by underscores.
            """
            if not title:
                title = self.defaultTitle
            safeTitle = re.sub(r'[\\/:*?"<>|]', u'_', title)
            self.file = open(safeTitle + ".txt", "w+")

        def writeData(self, contents):
            """Write each post in ``contents`` to the output file.

            When ``self.floorTag`` is '1', a separator line carrying the post
            number is written before each post. ``self.floor`` is advanced for
            every post either way.
            """
            for item in contents:
                if self.floorTag == '1':
                    # Separator between posts, on a line of its own.
                    floorLine = ("\n--------------" + str(self.floor) +
                                 "楼-----------------\n")
                    self.file.write(floorLine)
                # Posts are already UTF-8 byte strings; write them unchanged.
                self.file.write(item)
                self.floor += 1

        def start(self):
            """Crawl the whole thread and save it to a text file."""
            indexPage = self.getPage(1)
            pageNum = self.getPageNum(indexPage)
            # Validate the thread before creating any file on disk.
            if pageNum is None:
                print("URL已失效,请重试")
                return
            title = self.getTitle(indexPage)
            try:
                self.setFileTitle(title)
                print("该帖子共有" + str(pageNum) + "页")
                for i in range(1, int(pageNum) + 1):
                    print("正在写入第" + str(i) + "页数据")
                    # Page 1 has already been downloaded; reuse it.
                    page = indexPage if i == 1 else self.getPage(i)
                    self.writeData(self.getContent(page))
            except IOError as e:
                # str(e) carries errno/strerror; e.message is empty for IOError.
                print("写入异常,原因" + str(e))
            else:
                print("写入任务完成")
            finally:
                if self.file is not None:
                    self.file.close()
                    self.file = None
    # Interactive entry point: ask for the thread id and options, then crawl.
    # Guarded so that importing this module does not trigger the prompts.
    if __name__ == "__main__":
        print(u"请输入帖子代号")
        # raw_input already returns a string; strip stray whitespace so it
        # does not end up inside the URL.
        baseURL = 'http://tieba.baidu.com/p/' + raw_input(u'http://tieba.baidu.com/p/').strip()
        seeLZ = raw_input("是否只获取楼主发言,是输入1,否输入0 ")
        floorTag = raw_input("是否写入楼层信息,是输入1,否输入0 ")
        bdtb = BDTB(baseURL, seeLZ, floorTag)
        bdtb.start()

  • 相关阅读:
    [LeetCode]2. Add Two Numbers链表相加
    Integration between Dynamics 365 and Dynamics 365 Finance and Operation
    向视图列添加自定义图标和提示信息 -- PowerApps / Dynamics365
    Update the Power Apps portals solution
    Migrate portal configuration
    Use variable to setup related components visible
    Loyalty management on Retail of Dynamic 365
    Modern Fluent UI controls in Power Apps
    Change screen size and orientation of a canvas app in Power App
    Communication Plan for Power Platform
  • 原文地址:https://www.cnblogs.com/lebb1993/p/6080332.html
Copyright © 2011-2022 走看看