初学爬虫,做的一个爬取糗事百科的例子(Python + Beautiful Soup)
爬取糗事百科的热门内容,并把带图片的条目过滤掉。
# coding:utf-8
"""Print the text-only posts from the Qiushibaike "hot" listing.

Pages 1 through 35 of http://www.qiushibaike.com/hot/page/<n> are fetched
one after another.  Every post that contains a ``div.thumb`` element (an
image thumbnail) is skipped; the text of the remaining posts is printed to
standard output.

This is a Python 2 script (it relies on ``urllib2`` and ``reload``).
"""
import urllib2
import urllib
import sys
from contextlib import closing

from bs4 import BeautifulSoup

# Python 2 only workaround kept from the original script: it makes str() of
# the unicode post text encode as UTF-8 instead of using the ASCII default.
reload(sys)
sys.setdefaultencoding("utf-8")

BASE_URL = 'http://www.qiushibaike.com/hot/page/'
FIRST_PAGE = 1
LAST_PAGE = 35

# The request headers never change, so build them once instead of per page.
HEADERS = {'User-Agent': 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'}


def fetch_page(url):
    """Download *url* and return the response body decoded as UTF-8.

    Raises:
        urllib2.URLError: if the request fails (``HTTPError`` included).
    """
    request = urllib2.Request(url, headers=HEADERS)
    # The urllib2 response object is not a context manager in Python 2, so
    # closing() is used to make sure the connection is always released.
    with closing(urllib2.urlopen(request)) as response:
        return response.read().decode('utf8')


def extract_text_posts(html):
    """Return the text of every post in *html* that has no thumbnail.

    Posts without a ``div.content`` element are skipped rather than
    allowed to raise ``AttributeError`` and abort the whole crawl.
    """
    soup = BeautifulSoup(html, "html.parser")
    texts = []
    for post in soup.find_all('div', class_='article block untagged mb15'):
        if post.find('div', class_='thumb'):
            # Post carries an image thumbnail: filtered out.
            continue
        content = post.find('div', class_='content')
        if content is None:
            continue
        texts.append(content.text)
    return texts


def main():
    """Crawl every listing page and print the text-only posts."""
    for page in range(FIRST_PAGE, LAST_PAGE + 1):
        print("now page is " + str(page))
        url = BASE_URL + str(page)
        print(url)
        try:
            html = fetch_page(url)
        except urllib2.URLError as e:
            # HTTPError has .code; URLError has .reason.  Report whichever
            # is available and move on to the next page.
            if hasattr(e, "code"):
                print(e.code)
            if hasattr(e, "reason"):
                print(e.reason)
            continue
        for text in extract_text_posts(html):
            print(str(text))


if __name__ == "__main__":
    main()