A crawler I wrote to grab images from Douban groups... you know the kind...
The Python version used is 3.5.2.
Adapted from gdp12315's version found online. Just add the codes of the groups you want to grab to url_list, as in the sketch below.
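Each url_list entry is a (group code, folder name) pair; the group code is the part of the URL after https://www.douban.com/group/. A minimal sketch (the last entry is a made-up example, not a real group):

url_list = [
    ('tomorrow', '灵异豆瓣'),
    ('439803', '出差男女'),
    ('your_group_code', 'Folder name for that group'),  # hypothetical extra entry
]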
It can be stopped at any time. Many checks were added to avoid re-crawling and re-downloading; the downside is that it cannot detect updated topics (updates are rare... so they are simply ignored). The per-group progress record can be inspected as sketched below.
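The progress record is a pickled dict stored per group as Pictures\<group name>.info (see the UrlInfo class in the script). A minimal inspection sketch, assuming the script has already been run at least once for the 灵异豆瓣 group:

import pickle

# Path layout comes from the script: <working dir>\Pictures\<group name>.info
with open(r'Pictures\灵异豆瓣.info', 'rb') as f:
    state = pickle.load(f)

print(state['lastdt'])          # timestamp of the last completed pass
print(len(state['dic_topic']))  # how many topic IDs have already been processed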
I recommend adding more user_agents; it is an effective way to avoid 403 responses (see the sketch below).
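New entries simply go into the user_agents list inside BrowserBase.openurl; any real desktop browser User-Agent string will do. A sketch (the appended string is only an example value, not something Douban requires):

user_agents = [
    'Mozilla/5.0 (Windows; U; Windows NT 5.1; it; rv:1.8.1.11) Gecko/20071127 Firefox/2.0.0.11',
    'Opera/9.25 (Windows NT 5.1; U; en)',
    # ...keep the existing entries and append more, e.g.:
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36',
]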
If you have fixes or good suggestions, please contact me at lzl_17948876@hotmail.com.
# -*- coding: utf-8 -*-
# -----------------------------------------------
# Program:  Douban group image crawler
# Version:  1.2.2
# Language: Python 3.5.2
# Author:   刘志林
#
# Thanks: adapted from gdp12315's 1.0 version http://blog.csdn.net/gdp12315_gu/article/details/47323613
#
# 2016-11-07
#   Changed how processed links are recorded: one info record per group
# 2016-11-08
#   Fixed the completion timestamp being recorded at the wrong point; it is now taken before processing starts
# 2016-11-28
#   Added a counter for the total number of passes
#   Incremental fetch now checks: if the last planned page still contains unseen topics,
#   keep fetching further pages until a page consists entirely of already-fetched topics
# -----------------------------------------------

import random
import socket, http.cookies, http.cookiejar
import urllib.request
import re
import os, sys
import datetime, time
import pickle


class UrlInfo(object):
    """Per-group progress record: processed topic IDs and the last completed run time."""
    __filename = ''
    dic_topic = {}
    lastdt = ''

    def __init__(self, a_filename):
        self.__filename = a_filename
        self.dic_topic = {}
        self.lastdt = ''

    def load(self):
        if os.path.exists(self.__filename):
            f = open(self.__filename, 'rb')
            try:
                tmp = pickle.load(f)
            finally:
                f.close()
            self.__dict__.update(tmp)

    def save(self):
        f = open(self.__filename, 'wb')
        try:
            pickle.dump(self.__dict__, f)
        finally:
            f.close()


class BrowserBase(object):
    def __init__(self):
        socket.setdefaulttimeout(20)

    def speak(self, name, content):
        print('[%s]%s' % (name, content))

    def openurl(self, url):
        # A pool of user_agents to rotate through, to help avoid 403 responses
        user_agents = [
            'Mozilla/5.0 (Windows; U; Windows NT 5.1; it; rv:1.8.1.11) Gecko/20071127 Firefox/2.0.0.11',
            'Opera/9.25 (Windows NT 5.1; U; en)',
            'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)',
            'Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; en) Presto/2.8.131 Version/11.11',
        ]
        try:
            cookie_support = urllib.request.HTTPCookieProcessor(http.cookiejar.CookieJar())
            self.opener = urllib.request.build_opener(cookie_support, urllib.request.HTTPHandler)
            urllib.request.install_opener(self.opener)
            self.opener.addheaders = [
                ('Host', 'www.douban.com'),
                ('Connection', 'keep-alive'),
                ('Accept', '*/*'),
                ('User-Agent', random.choice(user_agents)),
                ('Referer', 'http://www.google.com'),
            ]
            res = self.opener.open(url)
            #print(res.read())
        except Exception as e:
            self.speak(str(e), url)
            raise
        else:
            return res
        finally:
            time.sleep(1)


if __name__ == '__main__':
    splider = BrowserBase()
    # Groups to process; the first value is the group code ->
    # https://www.douban.com/group/<group code>/discussion?start=
    url_list = [
        ('tomorrow', '灵异豆瓣'),
        ('439803', '出差男女'),
    ]
    # Base directory; progress records and downloaded images are stored under it
    workpath = os.getcwd() + '\\'
    loopCount = 0
    while True:
        for url_rec in url_list:
            print(' -------- (L-%d) %s %s start crawling --------' % (loopCount + 1, datetime.datetime.now().strftime('%Y-%m-%d %X'), url_rec[1]))
            # Create the output directory for this group
            filepath = '%sPictures\\%s\\' % (workpath, url_rec[1])
            if not os.path.exists(filepath):
                os.makedirs(filepath)
            url = 'https://www.douban.com/group/%s/discussion?start=' % (url_rec[0])
            try:
                html_topic_list = splider.openurl(url).read().decode('utf-8')
            except:
                continue
            # Load the progress record for this group
            info = UrlInfo('%sPictures\\%s.info' % (workpath, url_rec[1]))
            info.load()
            # Time of the last completed run; if empty, process everything
            if info.lastdt == '':
                print('First run, processing all pages')
                dt_last = None
            else:
                print('Last run finished at: %s' % (info.lastdt))
                dt_last = datetime.datetime.strptime(info.lastdt, '%Y-%m-%d %X')
            page_max = int(re.compile(r'\d+').findall(re.compile(r'data-total-page="\d+"').findall(html_topic_list)[0])[0])
            if dt_last == None:
                page_end = page_max
                num_end = (page_end - 1) * 25
            else:
                t2 = (datetime.datetime.now() - dt_last)
                num_end = t2.days * 24 * 6 + t2.seconds // 300   # assume a new topic appears roughly every 5 minutes
                page_end = num_end // 25 + 1
            # Record the start time of this run
            _lastdt = datetime.datetime.now().strftime('%Y-%m-%d %X')
            num_begin = 0
            page_begin = 1
            while num_begin <= num_end:
                try:
                    nFullTopicExists = True   # becomes False if this page contains any unseen topic
                    html_topic_list = splider.openurl(url + str(num_begin)).read().decode('utf-8')
                    # Collect the topic links on this page
                    topic_list = re.compile(r'https://www.douban.com/group/topic/\d+/').findall(html_topic_list)
                    topic_count = len(topic_list)
                    print('%s page: %d/%d - %d' % (url_rec[1], page_begin, page_end, topic_count))
                    for topic_url_index in range(topic_count):
                        topic_url = topic_list[topic_url_index]
                        #print('topic_url ' + topic_url)
                        # Skip topics that have already been processed
                        topic_code = re.findall(r'\d+', topic_url)[0]
                        if topic_code in info.dic_topic:
                            print('#%d ' % (topic_url_index + 1), end='')
                            continue
                        else:
                            nFullTopicExists = False
                        print('%d ' % (topic_url_index + 1), end='')
                        try:
                            html_topic = splider.openurl(topic_url).read().decode('utf-8')
                        except:
                            continue
                        # Mark the topic as processed
                        info.dic_topic[topic_code] = ''
                        info.save()
                        # Collect the image download URLs in this topic
                        img_list = re.compile(r'https://img\d.doubanio.com/view/group_topic/large/public/p\d+.jpg').findall(html_topic)
                        # Download and save each image
                        for img_url in img_list:
                            #print('img_url: ' + img_url)
                            filename = '%s\\%s-%s.jpg' % (filepath, topic_code, re.findall(r'p\d+', img_url)[0])
                            if not os.path.exists(filename):
                                try:
                                    #print(filename)
                                    download_img = urllib.request.urlretrieve(img_url, filename)
                                except Exception as e:
                                    print(e)
                                    continue
                                finally:
                                    time.sleep(2)
                        #waittime = random.randint(10,15)
                        #print('wait %d' % waittime)
                        #time.sleep(waittime)
                    num_begin = num_begin + 25
                    # If this incremental pass reached its planned end but the last page still had
                    # unseen topics, extend the range by one more page (see the 2016-11-28 note)
                    if (dt_last != None) and (num_begin > num_end) and (not nFullTopicExists):
                        num_end = num_end + 25
                except Exception as e:
                    print(e)
                    continue
                finally:
                    page_begin = page_begin + 1
                    print()
            info.lastdt = _lastdt
            info.save()
            print('-------- %s %s crawl finished -------- ' % (datetime.datetime.now().strftime('%Y-%m-%d %X'), url_rec[1]))
        loopCount = loopCount + 1
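A usage note: the script loops over url_list forever, so it is normally stopped with Ctrl+C; because every processed topic ID is written to the .info file immediately, a later run simply skips what was already fetched. The parsing also relies on specific markers in Douban's listing HTML (the data-total-page attribute and the topic links), so if nothing gets collected, a quick standalone check like the sketch below can tell whether those markers still match; the User-Agent value here is just a placeholder.

import re
import urllib.request

# Fetch the first listing page of one group and test the two regexes the spider relies on.
req = urllib.request.Request(
    'https://www.douban.com/group/tomorrow/discussion?start=0',
    headers={'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'})
html = urllib.request.urlopen(req).read().decode('utf-8')

print(re.findall(r'data-total-page="\d+"', html))                          # expected: ['data-total-page="NNN"']
print(len(re.findall(r'https://www.douban.com/group/topic/\d+/', html)))   # expected: number of topics on page 1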