zoukankan      html  css  js  c++  java
  • python百度贴吧爬虫

    # -*- coding: utf-8 -*-
    #coding=utf-8
    
    import urllib
    import urllib2
    import re
    import thread
    import time
    
    class BDTB(object):
        """Scraper for a single Baidu Tieba thread.

        Builds page URLs from a thread base URL plus the ``see_lz`` query
        parameter, downloads pages, and extracts the thread title and the
        post bodies with regular expressions.
        """

        # Captures the text of the thread title <h3> element.
        _TITLE_RE = re.compile(
            r'core_title_txt pull-left text-overflow.*?>(.*?)</h3>', re.S)
        # Captures the inner HTML of every post body <div>.
        _POST_RE = re.compile(r'<div id="post_content_.*?>(.*?)</div>', re.S)

        def __init__(self, baseurl, seeLz):
            """Store the thread URL and the ``see_lz`` query fragment.

            baseurl -- thread URL without a query string.
            seeLz   -- value appended as ``?see_lz=<seeLz>``.
            """
            self.baseUrl = baseurl
            self.seeLz = '?see_lz=' + str(seeLz)
            # Helper that strips HTML markup from a post body.
            self.Tool = Tool()

        def getPage(self, pageNum):
            """Download page ``pageNum`` of the thread.

            Returns the raw response body, or None (after printing a
            message) when urllib2 raises URLError.
            """
            url = self.baseUrl + self.seeLz + '&pn=' + str(pageNum)
            try:
                response = urllib2.urlopen(urllib2.Request(url))
                try:
                    return response.read()
                finally:
                    # Always release the connection, even if read() fails.
                    response.close()
            except urllib2.URLError as e:
                # ``reason`` may be an exception object rather than a string,
                # so convert it explicitly before concatenating.
                print("链接网络失败" + str(getattr(e, 'reason', e)))
                return None

        def getTitle(self):
            """Print and return the thread title taken from page 1.

            Returns None when the page could not be downloaded or no title
            element is found.
            """
            html = self.getPage(1)
            if html is None:
                return None
            match = self._TITLE_RE.search(html)
            if match is None:
                return None
            title = match.group(1)
            print(title)
            return title

        def getContent(self, page):
            """Print every post found in ``page`` with a floor-number header.

            page -- HTML of one thread page, or None if the download failed.

            Returns the list of cleaned post texts in page order (empty when
            ``page`` is None or contains no posts).
            """
            if page is None:
                return []
            posts = []
            for floor, raw in enumerate(self._POST_RE.findall(page), 1):
                text = self.Tool.replace(raw)
                # A single formatted string prints the same under the Python 2
                # print statement and the Python 3 print function.
                print(u'%d 楼--------------------------------------------\n' % floor)
                print(text)
                posts.append(text)
            return posts
    
    
    
    
    class Tool:
        """Strips HTML markup from a post body, leaving plain text."""

        # <img> tags, runs of 1-7 spaces, and &nbsp; entities (all deleted).
        removeImg = re.compile('<img.*?>| {1,7}|&nbsp;')
        # Opening and closing hyperlink tags (deleted; link text is kept).
        removeAddr = re.compile('<a.*?>|</a>')
        # Block-level tags that are turned into a single space.
        replaceLine = re.compile('<tr>|<div>|</div>|</p>')
        # Table cell openers, turned into a single space.
        replaceTD= re.compile('<td>')
        # Single or doubled <br>, turned into a single space.
        replaceBR = re.compile('<br><br>|<br>')
        # Any tag still left after the steps above (deleted).
        removeExtraTag = re.compile('<.*?>')
        # Runs of spaces, collapsed to one space.
        removeNoneLine = re.compile(' +')

        def replace(self,x):
            """Return ``x`` with markup removed and surrounding whitespace stripped.

            The substitutions run in a fixed order; later rules rely on the
            output of earlier ones, so the order must not change.
            """
            rules = (
                (self.removeImg, ""),
                (self.removeAddr, ""),
                (self.replaceLine, " "),
                (self.replaceTD, " "),
                (self.replaceBR, " "),
                (self.removeExtraTag, ""),
                (self.removeNoneLine, " "),
            )
            cleaned = x
            for pattern, substitute in rules:
                cleaned = pattern.sub(substitute, cleaned)
            # strip() drops leading and trailing whitespace.
            return cleaned.strip()
    
    
    
    
    # Driver: fetch page 2 of one thread and print its posts.
    baseURL = 'http://tieba.baidu.com/p/3138733512'
    # NOTE(review): the second argument becomes the see_lz query value;
    # presumably a 0/1 flag -- confirm that 2 is intended.
    bdtb = BDTB(baseURL, 2)
    page = bdtb.getPage(2)
    # getPage returns None after a network error; skip parsing in that case
    # instead of handing None to the regex search.
    if page is not None:
        bdtb.getContent(page)
  • 相关阅读:
    在 XD 和 Axure 中使用 iconfont
    chartjs 曲线图 纪要
    js ajax 等的的一些纪要
    程序员的方向
    sqlserver 常用的练习50例子
    (function(){})()原理
    layer弹出层详解
    sqlserver 表值函数 保存使用
    关于批量下载线程池与信号机制的使用
    tp5.1 phpstudy composer 配置等
  • 原文地址:https://www.cnblogs.com/norm/p/7426279.html
Copyright © 2011-2022 走看看