利用 BeautifulSoup(bs4)库进行爬取。下载 HTML 时,通过 user_agent 参数设置 User-Agent 请求头来发送请求;如果下载失败,并且 HTTP 状态码在 500–599 之间(服务器端错误),则会重新下载,最多重试 2 次(num_try=2)。
soup = BeautifulSoup(html, "html.parser")
当前页面是 HTML 时(使用 html.parser 解析器)
当前页面是 HTML5 时(使用 html5lib 解析器):
soup = BeautifulSoup(html, "html5lib")
# -*- coding: utf-8 -*-
"""Download every ``BDE_Image`` picture found on a Baidu Tieba thread page.

The page is fetched with a custom User-Agent header, parsed with
BeautifulSoup, and each matching ``<img>`` is saved as ``<index>.jpg``.
The code runs on both Python 2 and Python 3.
"""
import itertools
import os
import re
import urllib

try:  # Python 2
    import urllib2
    from urllib2 import Request, URLError, urlopen
except ImportError:  # Python 3: the same API lives in urllib.request / urllib.error
    from urllib.error import URLError
    from urllib.request import Request, urlopen

try:  # Not used by the functions below; guarded so a missing lxml is not fatal.
    import lxml.html
except ImportError:
    lxml = None

try:  # Only bs_scraper() needs bs4; the download helpers work without it.
    from bs4 import BeautifulSoup
except ImportError:
    BeautifulSoup = None


def download(url, user_agent='wswp', num_try=2):
    """Fetch *url* and return the raw response body.

    Args:
        url: Address to fetch.
        user_agent: Value sent in the ``User-agent`` request header.
        num_try: How many times to retry after a failure whose HTTP status
            code is in the 5xx range. Other failures are never retried.

    Returns:
        The response body as returned by ``read()``, or ``None`` when the
        download failed.
    """
    retries_left = num_try
    while True:
        print('Downloading: %s' % (url,))
        # The header name needs a hyphen: with 'User_agent' urllib keeps the
        # key as-is and still sends its own default User-agent.
        request = Request(url, headers={'User-agent': user_agent})
        try:
            response = urlopen(request)
            try:
                return response.read()
            finally:
                response.close()
        except URLError as e:
            print('Download error %s' % (e.reason,))
            # Only HTTP errors carry a ``code``; plain URL errors do not.
            code = getattr(e, 'code', None)
            is_server_error = code is not None and 500 <= code < 600
            if retries_left <= 0 or not is_server_error:
                return None
            retries_left -= 1


def download_picture(url, path, name):
    """Download *url* and store it as ``<path>/<name>.jpg``.

    Args:
        url: Address of the picture.
        path: Target directory; created (including parents) when missing.
        name: File name without the ``.jpg`` extension.

    Returns:
        ``True`` when the file was written, ``False`` when the download
        failed (in which case no file is created).
    """
    if not os.path.isdir(path):
        os.makedirs(path)
    # Download before opening the target so a failed download neither leaves
    # an empty file behind nor tries to write ``None``.
    data = download(url)
    if data is None:
        return False
    with open(os.path.join(path, name + '.jpg'), 'wb') as f:
        f.write(data)
    return True


def bs_scraper(html, path='/picture'):
    """Save every ``<img class="BDE_Image">`` found in *html*.

    Args:
        html: Page markup; ``None`` or empty means there is nothing to do.
        path: Directory the pictures are written to.

    Returns:
        The number of pictures that were saved.

    Raises:
        ImportError: If there is markup to parse but bs4 is not installed.
    """
    if not html:
        return 0
    if BeautifulSoup is None:
        raise ImportError('bs4 (BeautifulSoup) is required to parse the page')
    soup = BeautifulSoup(html, "html.parser")
    results = soup.find_all(name='img', attrs={'class': 'BDE_Image'})
    saved = 0
    for index, each in enumerate(results):
        src = each.get('src')
        if not src:
            # An <img> without a src has nothing to download; the index is
            # still consumed so file names match the position on the page.
            continue
        print(src)
        if download_picture(src, path, str(index)):
            saved += 1
    return saved


url = 'https://tieba.baidu.com/p/4693368072'

if __name__ == '__main__':
    html = download(url)
    bs_scraper(html)