zoukankan      html  css  js  c++  java
  • 帖子爬取

     1 # -*- coding: utf-8 -*-  
     2 import string  
     3 import urllib2  
     4 import re
     5 from BeautifulSoup import BeautifulSoup
     6 
     7 class Baidu_Spider:  
     8     def __init__(self,url):
     9         self.myUrl=url;
    10          
    11     # 初始化加载页面并将其转码储存  
    12     def baidu_tieba(self,page):
    13         self.myUrl=self.myUrl+str(page)
    14         print self.myUrl
    15         # 读取页面的原始信息并将其从gbk转码  
    16         myPage = urllib2.urlopen(self.myUrl).read().decode("gbk")
    17         soup = BeautifulSoup(myPage)
    18         thread_list = soup.findAll("div",attrs={"class":'t_con clearfix'})
    19         for record in thread_list:
    20             #print record
    21             author = self.find_author(record)
    22             #print author
    23             hot = self.find_hot(record)
    24             #print hot
    25             title = self.find_title(record)
    26             #print title
    27             content = self.find_content(record)
    28             #print content
    29             url = self.find_url(record)
    30             #print url
    31             if url !="":
    32                 self.save_data(url,title,content,author,hot)
    33     
    34     # 用来寻找该帖的相关信息 
    35     def find_url(self,record):
    36         ahref=record.findAll("a",attrs={"class":'j_th_tit'})
    37         if len(ahref)>0:
    38             url='http://tieba.baidu.com'+ahref[0]['href'] 
    39         else:
    40             url=""
    41 
    42         return url 
    43     def find_author(self,record):
    44         author1=record.find("span",attrs={"class":'tb_icon_author '})
    45         author2=author1.find("a",attrs={"class":'j_user_card'})
    46         if author2 is not None:
    47             author=author2.next
    48         else:
    49             author=author1.text
    50         return author
    51     def find_hot(self,record):
    52         hot=record.find("div",attrs={"class":'threadlist_rep_num'}).text
    53         return hot
    54     def find_title(self,record):
    55         tt=record.findAll("a",attrs={"class":'j_th_tit'})
    56         if len(tt)>0:
    57             title=tt[0]['title']
    58         else:
    59             title=''
    60         return title 
    61     def find_content(self,record):
    62         content=record.find("div",attrs={"class":'threadlist_abs threadlist_abs_onlyline'})
    63         if content is not None:
    64             content=content.next.text
    65         else:
    66             content=''
    67         return content
    68     
    69        
    70     # 用来存储楼主发布的内容  
    71     def save_data(self,url,title,content,author,hot):  
    72         data=url+"07"+hot+"07"+author+"07"+title+"07"+content+"
    "
    73         data=data.encode('utf-8')
    74         #print data
    75         f = open('spider'+'.txt','a')  
    76         f.write(data)  
    77         f.close()
    78 
    79 #-------- 程序入口处 ------------------  
    80 page=0
    81 print u'已经启动百度贴吧爬虫'
    82 while True:
    83     bdurl ='http://tieba.baidu.com/f?kw=%C9%CF%BA%A3%BD%BB%CD%A8%B4%F3%D1%A7&tp=0&pn='
    84     mySpider = Baidu_Spider(bdurl)
    85     mySpider.baidu_tieba(page)
    86     page=page+50
    View Code
  • 相关阅读:
    变量,基本数据类型
    编程语言分类,Python介绍、IDE集成开发环境,注释
    Django之Cookie,Session
    第三章
    第二章
    第一章
    php面向对象(文件操作)
    php面向对象(目录操作)
    php有关类和对象的相关知识2
    php有关类和对象的相关知识1
  • 原文地址:https://www.cnblogs.com/liutoutou/p/3492634.html
Copyright © 2011-2022 走看看