zoukankan      html  css  js  c++  java
  • python百度贴吧爬虫

    # -*- coding: utf-8 -*-
    #coding=utf-8
    
    import urllib
    import urllib2
    import re
    import thread
    import time
    
    class BDTB:
        """Minimal scraper for one Baidu Tieba thread, built on urllib2.

        Attributes:
            baseUrl: thread URL without a query string.
            seeLz: query fragment ``?see_lz=<value>`` appended to every request.
            Tool: ``Tool`` instance used to turn post HTML into plain text.
        """

        # Captures the text of the title element (up to the closing </h3>).
        _TITLE_RE = re.compile('core_title_txt pull-left text-overflow.*?>(.*?)</h3>', re.S)
        # Captures the inner HTML of every <div id="post_content_...">.
        _POST_RE = re.compile('<div id="post_content_.*?>(.*?)</div>', re.S)

        def __init__(self, baseurl, seeLz):
            """Store the thread URL and the value sent as the ``see_lz`` parameter."""
            self.baseUrl = baseurl
            self.seeLz = '?see_lz=' + str(seeLz)
            self.Tool = Tool()

        def getPage(self, pageNum):
            """Download page ``pageNum`` of the thread.

            Returns:
                The raw response body, or None when the request fails with
                ``urllib2.URLError`` (the failure reason is printed).
            """
            url = self.baseUrl + self.seeLz + '&pn=' + str(pageNum)
            try:
                response = urllib2.urlopen(urllib2.Request(url))
                try:
                    return response.read()
                finally:
                    # Release the connection even when read() fails.
                    response.close()
            except urllib2.URLError as e:
                # ``reason`` may be an exception object (e.g. socket.error)
                # rather than a string, so convert before concatenating.
                print("链接网络失败" + str(getattr(e, 'reason', e)))
                return None

        def getTitle(self):
            """Print and return the thread title taken from page 1.

            Returns:
                The title text, or None when the page could not be downloaded
                or contains no title element.
            """
            html = self.getPage(1)
            if html is None:
                return None
            match = self._TITLE_RE.search(html)
            if match is None:
                return None
            title = match.group(1)
            print(title)
            return title

        def getContent(self, page):
            """Print every post found in ``page``, numbered from floor 1.

            Args:
                page: HTML of one thread page as returned by ``getPage``;
                    None or an empty string prints nothing.
            """
            if not page:
                # getPage() returns None on failure; nothing to print then.
                return
            for floor, post in enumerate(self._POST_RE.findall(page), 1):
                print(u'%d 楼--------------------------------------------\n' % floor)
                print(self.Tool.replace(post))
    
    
    
    
    class Tool:
        """Convert the HTML of a forum post into plain text.

        Line-breaking tags become real line breaks and table cells become
        tabs, so the cleaned text keeps the line structure of the post; all
        other markup is dropped.
        """

        # <img> tags, runs of 1-7 spaces and &nbsp; entities are deleted.
        removeImg = re.compile('<img.*?>| {1,7}|&nbsp;')
        # Opening and closing hyperlink tags are deleted (link text is kept).
        removeAddr = re.compile('<a.*?>|</a>')
        # Tags that are replaced by a line break.
        replaceLine = re.compile('<tr>|<div>|</div>|</p>')
        # Table cells are replaced by a tab.
        replaceTD = re.compile('<td>')
        # A single or double <br> is replaced by a line break.
        replaceBR = re.compile('<br><br>|<br>')
        # Any tag that is still present is deleted.
        removeExtraTag = re.compile('<.*?>')
        # Runs of consecutive line breaks (blank lines) collapse into one.
        removeNoneLine = re.compile(r'\n+')

        def replace(self, x):
            """Return ``x`` with its HTML markup converted to plain text.

            Args:
                x: HTML fragment of a single post.

            Returns:
                The text with tags removed, line-breaking tags turned into
                ``\\n``, ``<td>`` turned into ``\\t``, blank lines collapsed and
                leading/trailing whitespace stripped.
            """
            x = self.removeImg.sub("", x)
            x = self.removeAddr.sub("", x)
            x = self.replaceLine.sub("\n", x)
            x = self.replaceTD.sub("\t", x)
            x = self.replaceBR.sub("\n", x)
            # Must run after the specific replacements above, otherwise the
            # tags they rely on would already be gone.
            x = self.removeExtraTag.sub("", x)
            x = self.removeNoneLine.sub("\n", x)
            # strip() drops the leading/trailing whitespace left by the steps above.
            return x.strip()
    
    
    
    
    # Driver: download page 2 of the thread and print every post found on it.
    baseURL = 'http://tieba.baidu.com/p/3138733512'
    bdtb = BDTB(baseurl=baseURL, seeLz=2)
    bdtb.getContent(page=bdtb.getPage(pageNum=2))
  • 相关阅读:
    资料工作手册
    这么点破玩艺,昨天我为了学会它,花了六小时
    压力太大,使人过早衰老
    这是真的么。
    我可怜的好友。。。
    操了,上个网怎么就这么憋屈呢
    编程习惯,代码风格,其实很重要
    其实,我并不喜欢脚本语言
    咱也起一卦,看看北京是否还会继续下雨
    我一点不偏激,只是,我从不抱有侥幸思想
  • 原文地址:https://www.cnblogs.com/norm/p/7426279.html
Copyright © 2011-2022 走看看