  • Douban group crawler.....^_^

    A crawler written to grab pictures from Douban groups... you know the kind...

    Written against Python 3.5.2.

    Adapted from gdp12315's version found online; just add the codes of the groups you want to crawl to url_list, as in the sketch below.
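
    For example, adding one more group only takes another (code, name) tuple in url_list; the group code is the part after /group/ in the group's URL, and the entry below is a hypothetical placeholder:

        # Hypothetical entry -- replace '123456' with a real group code; the second
        # value is just the folder name the downloaded images will be saved under.
        url_list.append(('123456', 'my-group'))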

    It can be stopped at any time, and many checks were added to prevent re-crawling and re-downloading; the downside is that updated topics are not detected (updates are rare... so they are ignored).

    Adding more user_agents is recommended; it is effective at avoiding 403 errors. An example follows.
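
    For instance, a few extra desktop browser strings could be appended to the user_agents list inside openurl; the strings below are purely illustrative:

        # Purely illustrative additions -- any reasonably current desktop UA strings will do
        user_agents += [
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36',
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/603.3.8 (KHTML, like Gecko) Version/10.1.2 Safari/603.3.8',
            ]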

    If you have fixes or good suggestions, please contact me at lzl_17948876@hotmail.com

    # -*- coding: utf-8 -*-
    # -----------------------------------------------
    #   Program: Douban group image crawler
    #   Version: 1.2.2
    #   Language: Python 3.5.2
    #   Author: 刘志林
    #
    #   Thanks: adapted from gdp12315's 1.0 version  http://blog.csdn.net/gdp12315_gu/article/details/47323613
    #
    #   2016-11-07
    #       Changed how processed links are recorded: one info record per group
    #   2016-11-08
    #       Fixed the timestamp being recorded at the wrong point; it is now recorded before processing starts
    #   2016-11-28
    #       Added a counter for the total number of passes over the group list
    #       For incremental fetches, if the last page still contains unfetched topics, keep fetching further pages until a page contains only already-fetched topics
    # -----------------------------------------------
    
        
    import random
    import socket, http.cookies, http.cookiejar
    import urllib.request
    import re
    import os, sys
    import datetime, time
    import pickle
    
    class UrlInfo(object):
        __filename = ''
        dic_topic = {}
        lastdt = ''
        
        def __init__(self, a_filename):
            self.__filename = a_filename
            self.dic_topic = {}
            self.lastdt = ''
    
        def load(self):
            if os.path.exists(self.__filename):
                f = open(self.__filename, 'rb')
                try:
                    tmp = pickle.load(f)
                finally:
                    f.close()
                self.__dict__.update(tmp)
    
        def save(self):
            f = open(self.__filename, 'wb')
            try:
                pickle.dump(self.__dict__, f)
            finally:
                f.close()
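
    # Usage sketch (illustrative only; the file name 'demo.info' is made up):
    #   info = UrlInfo('demo.info')
    #   info.load()                        # restores dic_topic / lastdt if the file exists
    #   info.dic_topic['12345678'] = ''    # mark topic 12345678 as already processed
    #   info.save()                        # pickles the instance __dict__ back to disk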
            
    
    class BrowserBase(object): 
    
        def __init__(self):
            socket.setdefaulttimeout(20)
    
        def speak(self, name, content):
            print('[%s]%s' % (name, content))
    
        def openurl(self,url):
            # A pool of user_agents to help avoid 403 responses
            user_agents = [
                        'Mozilla/5.0 (Windows; U; Windows NT 5.1; it; rv:1.8.1.11) Gecko/20071127 Firefox/2.0.0.11',
                        'Opera/9.25 (Windows NT 5.1; U; en)',
                        'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)',
                        'Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; en) Presto/2.8.131 Version/11.11',
                        ]
    
            try:
                cookie_support= urllib.request.HTTPCookieProcessor(http.cookiejar.CookieJar())
                self.opener = urllib.request.build_opener(cookie_support,urllib.request.HTTPHandler)
                urllib.request.install_opener(self.opener)
                self.opener.addheaders = [
                    ('Host','www.douban.com'),
                    ('Connection', 'keep-alive'),
                    ('Accept', '*/*'),
                    ('User-Agent', random.choice(user_agents)),
                    ('Referer','http://www.google.com'),
                    ]
                
                res = self.opener.open(url)
                #print(res.read())
            except Exception as e:
                self.speak(str(e),url)
                raise Exception
            else:
                return res
            finally:
                time.sleep(1)
    
    
    if __name__=='__main__':
        splider=BrowserBase()
            
        
    # Groups to process; the first value is the group code -> https://www.douban.com/group/<group code>/discussion?start=
    url_list = [
        ('tomorrow', '灵异豆瓣'),
        ('439803', '出差男女'),
        ]
    
    # Records topics that have already been processed
    
    workpath = os.getcwd() + '\\'
    
    loopCount = 0
    
    while True:
        for url_rec in url_list:
            print('\n-------- (L-%d) %s  %s crawl started --------' % (loopCount + 1, datetime.datetime.now().strftime('%Y-%m-%d %X'), url_rec[1]))
            # Create the output directory for this group
            filepath = '%sPictures\\%s\\' % (workpath, url_rec[1])
            if not os.path.exists(filepath):
                os.makedirs(filepath)
    
            url = 'https://www.douban.com/group/%s/discussion?start='%(url_rec[0])
            try:
                html_topic_list = splider.openurl(url).read().decode('utf-8')
            except:
                continue
    
            # Load the per-group state file
            info = UrlInfo('%sPictures\\%s.info' % (workpath, url_rec[1]))
            info.load()
    
            # Time of the last completed run; if empty, process everything
            if info.lastdt == '':
                print('First run')
                dt_last = None
            else:
                print('Last run finished at: %s' % (info.lastdt))
                dt_last = datetime.datetime.strptime(info.lastdt, '%Y-%m-%d %X')
    
            page_max = int(re.compile(r'\d+').findall(re.compile(r'data-total-page="\d+"').findall(html_topic_list)[0])[0])
            if dt_last == None:
                page_end = page_max
                num_end = (page_end - 1) * 25
            else:
                t2 = (datetime.datetime.now() - dt_last)
                num_end = t2.days * 24 * 6 + t2.seconds // 300  # assume a new topic appears roughly every 5 minutes
                page_end = num_end // 25 + 1
    
            # Record when this run started
            _lastdt = datetime.datetime.now().strftime('%Y-%m-%d %X')
    
            num_begin = 0
            page_begin = 1
            while num_begin <= num_end:
                try:
                    nFullTopicExists = True
                    html_topic_list = splider.openurl(url+str(num_begin)).read().decode('utf-8')
                    # Collect the topic URLs on this page
                    topic_list = re.compile(r'https://www.douban.com/group/topic/\d+/').findall(html_topic_list)
                    topic_count = len(topic_list)
                    print('%s page: %d/%d - %d'%(url_rec[1], page_begin, page_end, topic_count))
    
                    for topic_url_index in range(topic_count):
                        topic_url = topic_list[topic_url_index]
                        #print('topic_url '+topic_url)
                        
                        # Skip topics that have already been processed
                        topic_code = re.findall(r'\d+', topic_url)[0]
                        if topic_code in info.dic_topic:
                            print('#%d '%(topic_url_index + 1), end='')
                            continue
                        else:
                            nFullTopicExists = False
                            print('%d '%(topic_url_index + 1), end='')
                        
                        try:
                            html_topic = splider.openurl(topic_url).read().decode('utf-8')
                        except:
                            continue
    
                        # Record that this topic has been processed
                        info.dic_topic[topic_code] = ''
                        info.save()
                            
                        # Collect the image download URLs in this topic
                        img_list = re.compile(r'https://img\d.doubanio.com/view/group_topic/large/public/p\d+\.jpg').findall(html_topic)
                        
                        # Download and save each image
                        for img_url in img_list:
                            #print('img_url: '+img_url)
                            filename = '%s\\%s-%s.jpg' % (filepath, topic_code, re.findall(r'p\d+', img_url)[0])
                            if not os.path.exists(filename):
                                try:
                                    #print(filename)
                                    download_img = urllib.request.urlretrieve(img_url, filename)
                                except Exception as e:
                                    print(e)
                                    continue
                                finally:
                                    time.sleep(2)
                         #waittime = random.randint(10,15)
                         #print('wait %d'%waittime)
                         #time.sleep(waittime)
                    num_begin = num_begin + 25
                    if (dt_last != None) and (num_begin > num_end) and (not nFullTopicExists):
                        num_end = num_end + 25
                except Exception as e:
                    print(e)
                    continue
                finally:
                    page_begin = page_begin + 1
                print()
    
            info.lastdt = _lastdt
            info.save()
            print('-------- %s  %s crawl finished --------\n' % (datetime.datetime.now().strftime('%Y-%m-%d %X'), url_rec[1]))
        loopCount = loopCount + 1
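
    Each group's .info file is a plain pickle of the UrlInfo instance's __dict__, so its contents can be checked from an interactive session; a minimal sketch, assuming the Pictures folder layout created by the script above:

        import pickle

        # '灵异豆瓣' is just one of the group folder names from url_list above
        with open(r'Pictures\灵异豆瓣.info', 'rb') as f:
            state = pickle.load(f)
        print('last run:', state['lastdt'])
        print('topics recorded:', len(state['dic_topic']))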
  • Original post: https://www.cnblogs.com/lzl_17948876/p/6031001.html