zoukankan      html  css  js  c++  java
  • python百度贴吧爬虫

    # -*- coding: utf-8 -*-
    #coding=utf-8
    
    import urllib
    import urllib2
    import re
    import thread
    import time
    
    class BDTB:
        """Minimal Baidu Tieba thread scraper.

        Downloads pages of a single thread and prints the title and the
        cleaned-up text of every post found on a page.
        """

        def __init__(self, baseurl, seeLz):
            """Remember the thread URL and build the ``see_lz`` query fragment.

            baseurl -- thread URL without a query string.
            seeLz   -- value sent as the ``see_lz`` query parameter.
            """
            self.baseUrl = baseurl
            self.seeLz = '?see_lz=' + str(seeLz)
            self.Tool = Tool()

        def getPage(self, pageNum):
            """Return the raw HTML of page ``pageNum``, or None if the request fails."""
            url = self.baseUrl + self.seeLz + '&pn=' + str(pageNum)
            try:
                response = urllib2.urlopen(urllib2.Request(url))
            except urllib2.URLError as e:
                # ``reason`` is not guaranteed to be a string (it can wrap a
                # socket error), so convert it before concatenating.
                print("链接网络失败" + str(getattr(e, 'reason', e)))
                return None
            try:
                return response.read()
            finally:
                # Always release the connection, even if read() raises.
                response.close()

        def getTitle(self):
            """Print and return the thread title taken from page 1.

            Returns None when the page could not be fetched or the title
            markup is not present.
            """
            html = self.getPage(1)
            if html is None:
                # getPage already reported the network failure.
                return None
            pattern = re.compile('core_title_txt pull-left text-overflow.*?>(.*?)</h3>', re.S)
            result = pattern.search(html)
            if result is None:
                return None
            title = result.group(1)
            print(title)
            return title

        def getContent(self, page):
            """Print every post body found in ``page`` with a floor-number header.

            page -- raw HTML as returned by getPage; None (a failed fetch)
                    is accepted and prints nothing.
            """
            if page is None:
                return
            pattern = re.compile('<div id="post_content_.*?>(.*?)</div>', re.S)
            # Floors are numbered from 1 in the order the posts appear.
            for floor, item in enumerate(pattern.findall(page), 1):
                print(u'%d 楼--------------------------------------------\n' % floor)
                print(self.Tool.replace(item))
    
    
    
    
    class Tool:
        """Convert the HTML of a post body into plain text.

        The substitutions in ``replace`` run in a fixed order: spaces and
        images are dropped first, structural tags become newlines or tabs,
        any tag still left is removed, and blank lines are collapsed.
        """

        # Remove <img> tags, runs of 1-7 spaces and &nbsp; entities.
        # Longer runs are consumed by consecutive matches, so every space
        # in the input is deleted.
        removeImg = re.compile('<img.*?>| {1,7}|&nbsp;')
        # Remove hyperlink tags but keep the link text.
        removeAddr = re.compile('<a.*?>|</a>')
        # Tags that end a line; replaced with "\n".
        replaceLine = re.compile('<tr>|<div>|</div>|</p>')
        # Table cell start; replaced with "\t".
        replaceTD = re.compile('<td>')
        # Single or double <br>; replaced with "\n".
        replaceBR = re.compile('<br><br>|<br>')
        # Any tag not handled above is removed.
        removeExtraTag = re.compile('<.*?>')
        # Runs of newlines (blank lines) are collapsed to a single "\n".
        removeNoneLine = re.compile(r'\n+')

        def replace(self, x):
            """Return ``x`` with HTML markup stripped and line structure kept.

            x -- HTML fragment of one post.
            Returns the plain text with leading/trailing whitespace removed.
            """
            x = self.removeImg.sub("", x)
            x = self.removeAddr.sub("", x)
            # Line-ending tags and <br> become real newlines and <td> a tab,
            # as the pattern comments describe; substituting a plain space
            # here would flatten every post onto one line.
            x = self.replaceLine.sub("\n", x)
            x = self.replaceTD.sub("\t", x)
            x = self.replaceBR.sub("\n", x)
            x = self.removeExtraTag.sub("", x)
            x = self.removeNoneLine.sub("\n", x)
            # strip() drops the leftover whitespace at both ends.
            return x.strip()
    
    
    
    
    # Thread to scrape; the second constructor argument is sent as see_lz.
    baseURL = 'http://tieba.baidu.com/p/3138733512'
    bdtb = BDTB(baseURL, 2)
    # Fetch page 2 of the thread, then print every post found on it.
    page = bdtb.getPage(2)
    bdtb.getContent(page)
  • 相关阅读:
    计算系数
    P2734 [USACO3.3]游戏 A Game——区间dp+博弈论
    4.14作业
    安装MySQL数据库,建立用户表 uid uname upwd 并插入3条数据 2.制作jsp登录页面 index.jsp 提交到ok.jsp,使用jdbc连数据库,判断输入的用户名密码是否存在 3.如果存在,把用户名保存,跳转到yes.jsp
    jsp 3.10作业
    软件测试第一次课堂练习3.4
    easysync 协同算法详解
    支付宝订阅消息推送
    Linux防火墙操作指令
    Windows端口号操作
  • 原文地址:https://www.cnblogs.com/norm/p/7426279.html
Copyright © 2011-2022 走看看