zoukankan      html  css  js  c++  java
  • 简单的百度贴吧爬虫实现(urllib)

    环境:ubuntu 16.04 LTS   (X86-64),pycharm

    python版本 :3.5.1+

    #生成的文件默认会保存到运行脚本时的当前工作目录(通常即代码所在目录)

    1
    import re
    import urllib.error
    import urllib.request


    class Tool:
        """Strip HTML markup from a Tieba post body, leaving plain text."""

        # Whole <img ...> tags and runs of seven spaces.
        removeImg = re.compile(r'<img.*?>| {7}')
        # Opening <a ...> and closing </a> tags; the link text itself is kept.
        removeAddr = re.compile(r'<a.*?>|</a>')
        # Block-level tags that are turned into a separator.
        replaceLine = re.compile(r'<tr>|<div>|</div>|</p>')
        replaceTD = re.compile(r'<td>')
        # One or more consecutive line-break tags. Only real tags are matched,
        # so the letters "br" inside ordinary words are left alone.
        replaceBR = re.compile(r'(?:<br\s*/?>|</br>)+')
        # Any tag that survived the passes above.
        replaceExtra = re.compile(r'<.*?>')

        def replace(self, x):
            """Return ``x`` with HTML tags removed and outer whitespace stripped.

            The order matters: specific tags are handled first and the
            catch-all ``replaceExtra`` runs last.
            """
            x = self.removeImg.sub("", x)
            x = self.removeAddr.sub("", x)
            x = self.replaceLine.sub(" ", x)
            x = self.replaceTD.sub(" ", x)
            x = self.replaceBR.sub(" ", x)
            x = self.replaceExtra.sub("", x)
            return x.strip()


    class BDTB:
        """Download every page of one Baidu Tieba thread into a text file."""

        def __init__(self, baseUrl, see_lz):
            """Remember the thread URL.

            baseUrl -- thread URL without a query string.
            see_lz  -- value sent as the ``see_lz`` query parameter.
            """
            self.tool = Tool()
            self.baseurl = baseUrl + '?see_lz=' + str(see_lz) + '&pn='
            self.defaultTitle = u'百度贴吧'
            # Output file handle; opened by setFileTitle().
            self.file = None

        def getPage(self, pagenum):
            """Fetch page ``pagenum`` and return its HTML, or None on URLError."""
            url = self.baseurl + str(pagenum)
            try:
                request = urllib.request.Request(url)
                with urllib.request.urlopen(request) as response:
                    return response.read().decode('utf-8')
            except urllib.error.URLError as e:
                # reason may be an exception object and code is an int, so both
                # must be converted before string concatenation.
                if hasattr(e, 'reason'):
                    print(u'connect error reason:' + str(e.reason))
                if hasattr(e, 'code'):
                    print(u'connect error,code:' + str(e.code))
                return None

        def getPns(self, content):
            """Return the thread's page count parsed from ``content``.

            Returns None when ``content`` is None, the counter is not found,
            or the captured text is not an integer.
            """
            if content is None:
                return None
            pattern = re.compile(
                '<li class="l_reply_num".*?<span class="red">(.*?)</span>', re.S)
            match = pattern.search(content)
            if match is None:
                return None
            try:
                return int(match.group(1))
            except ValueError:
                return None

        def getTitle(self, content):
            """Return the thread title found in ``content``, or None if absent."""
            if content is None:
                return None
            pattern = re.compile(
                '<h3 class="core_title_txt pull-left text-overflow ".*?>(.*?)</h3>',
                re.S)
            match = pattern.search(content)
            if match is None:
                return None
            return str(match.group(1))

        def getContent(self, content):
            """Return one formatted string per post found in ``content``.

            Returns an empty list when ``content`` is None (failed download).
            """
            if content is None:
                return []
            pattern = re.compile(
                '<ul class="p_author".*?<li class="d_name".*?target="_blank">(.*?)</a>'
                '.*?<div id="post_content_.*?>(.*?)</div>',
                re.S)
            # NOTE(review): the single-space separators below may have been
            # newlines before this listing was scraped -- confirm with the author.
            return [
                'Username: ' + author + ' content: ' + self.tool.replace(body) + ' '
                for author, body in pattern.findall(content)
            ]

        def setFileTitle(self, Title):
            """Open ``<Title>.txt`` for writing; use the default title if unusable.

            Characters that are not valid in file names are replaced by ``_``.
            """
            name = self.defaultTitle
            if Title is not None:
                cleaned = re.sub(r'[\\/:*?"<>|\r\n\t]', '_', Title).strip()
                if cleaned:
                    name = cleaned
            # Explicit encoding so non-ASCII posts do not depend on the locale.
            self.file = open(name + '.txt', 'w+', encoding='utf-8')

        def WriteData(self, contents):
            """Append every string in ``contents`` to the open output file."""
            self.file.writelines(contents)

        def start(self):
            """Download all pages of the thread and write them to the output file."""
            firstPage = self.getPage(1)
            Pns = self.getPns(firstPage)
            if Pns is None:
                print('Can not read the page count, please check the discussion num.')
                return
            self.setFileTitle(self.getTitle(firstPage))
            try:
                # Pages are numbered from 1 up to and including Pns.
                for i in range(1, Pns + 1):
                    print('Page Sum:' + str(Pns) + ' ')
                    print('Now is Write page:' + str(i) + ' ')
                    # Page 1 is already downloaded; do not fetch it again.
                    page = firstPage if i == 1 else self.getPage(i)
                    self.WriteData(self.getContent(page))
            finally:
                self.file.close()


    if __name__ == '__main__':
        print('please enter discussion num:')
        url = 'http://tieba.baidu.com/p/' + input().strip()
        see_lz = input('Whether just see lz(enter 0 or 1)').strip()
        bdtb = BDTB(url, see_lz)
        bdtb.start()

    运行结果:

  • 相关阅读:
    正式定居博客圆,发些以前在Topcoder上的练习题,对算法和STL有兴趣的朋友可以看下:)
    TopCoder真题讲解之二
    “命名空间“System”中不存在类型或命名空间名称“Linq”(是缺少程序集引用吗?)”
    短信发送
    JavaScript打印和预览等
    .net获取IP地址的几种方法转载
    WinForm中控件与背景透明
    用C#实现C/S模式下软件自动在线升级转
    Microsoft Access 时间函数汇总
    .net 发送Email 单发 群发
  • 原文地址:https://www.cnblogs.com/INnoVationv2/p/5679849.html
Copyright © 2011-2022 走看看