zoukankan      html  css  js  c++  java
  • python爬取豆瓣里面活动照片的工厂设计模式

    #!/usr/bin/python
    # coding: utf-8
    
    #
    # filename: pachong of doubban
    # 
    # author: Hacker_MJW
    #
    # Date: 2014-02-28
    #
    
    
    import urllib
    import urllib2
    
    import logging
    import os
    
    import re
    
    import time
    
    
    
    class SuperReptile(object):
        '''
        Base crawler class, intended to be subclassed.

        Sets up a per-crawler file logger (named <pro_name>.txt in the
        working directory) and provides a generic page fetcher, open_url.
        '''
        def __init__(self,
          init_url, pro_name):
            '''
            init_url: the first URL this crawler will visit.
            pro_name: crawler name; also used as the logger name and
                      the log file name stem.
            '''
            self.init_url = init_url
            self.pro_name = pro_name
            # Last page body fetched; '' until open_url succeeds once.
            # (The original never initialized this, so open_url raised
            # AttributeError at `return self.page` if the first fetch failed.)
            self.page = ''

            self.log = logging.getLogger(self.pro_name)
            self.log.setLevel(logging.DEBUG)

            self.handler = logging.FileHandler(filename=self.pro_name+'.txt')
            self.handler.setLevel(logging.DEBUG)

            self.formatter = logging.Formatter('%(asctime)s-%(name)s-%(levelname)s-%(message)s')
            self.handler.setFormatter(self.formatter)

            self.log.addHandler(self.handler)

        def open_url(self, url):
            '''
            Fetch url and return the raw page body as a string.

            On HTTPError the code is logged and the crawler sleeps 100s
            (crude rate-limit back-off) before returning the previously
            fetched page ('' if there was none). Other errors are logged
            with traceback and likewise fall through to the last page.
            '''
            req = urllib2.Request(url)
            try:
                rps = urllib2.urlopen(req)
                try:
                    self.page = rps.read()
                finally:
                    # Original leaked the response object.
                    rps.close()
                self.log.info('open %s successfully' % url)
            except urllib2.HTTPError as e:
                self.log.error('httperror %d' % e.code)
                time.sleep(100)
            except Exception:
                # Keep crawling on unexpected failures, but record the
                # traceback instead of silently dropping it (the original
                # used a bare `except:`).
                self.log.exception('unknown error opening %s' % url)
            self.handler.flush()
            return self.page

        def download(self):
            '''Hook for subclasses: download the collected resources.'''
            pass

        def close_log(self):
            '''Detach, flush and close the file log handler.'''
            self.log.removeHandler(self.handler)
            self.handler.flush()
            self.handler.close()
    
    
    class DoubanReptile(SuperReptile):
        '''
        superclass: __init__(self, init_url, pro_name)
        爬取豆瓣活动图片的爬虫
        '''
        def __init__(self, url):
            super(DoubanReptile, self).__init__(url, 'DoubanReptile')
            self.start_page = super(DoubanReptile, self).open_url(url).replace('
    ', '')
    
            self.p_img = '<divs*class="photo_wrap"s*>(.*?)</a>'
            self.p_img_href = '<as*href="(.*?)"'
            self.p_img_img = '<imgs*src="(.*?)"s*/>'
            self.p_urls_a = '<spans*class="thispage".*?>(.*?)<spans*class="next">'
            self.p_urls_b = '<as*href="(.*?)"s*>'
            self.img_list = []
    
            if not os.path.exists(os.getcwd()+'\doubanphotos'):
                try:
                    os.mkdir(os.getcwd()+'\doubanphotos')
                except:
                    pass
            else:
                self.path = os.getcwd() + '\doubanphotos'
    
        def compile_p(self):
            #
            #compile re pattern
            #
            self.img_p = re.compile(self.p_img)
            self.img_href = re.compile(self.p_img_href)
            self.img_img = re.compile(self.p_img_img)
            self.url_a = re.compile(self.p_urls_a)
            self.url_b = re.compile(self.p_urls_b)
    
        def get_urls(self):
            #
            #get a numbers of url of the item
            #
            div = self.url_a.findall(self.start_page)
            self.url_list = self.url_b.findall(div[0])
    
        def get_img_src(self):
            #
            #get a numbers of sources of the item
            #
            page_list = []
            page_list.append(self.start_page)
            for url in self.url_list:
                page = super(DoubanReptile, self).open_url(url).replace('
    ', '')
                page_list.append(page)
    
            for page in page_list:
                img_divs = self.img_p.findall(page)
                name = 'douban'
                for div in img_divs:
                    img_href = self.img_href.findall(div)
                    page_big = super(DoubanReptile, self).open_url(img_href[0]).replace('
    ', '')
                    img_src = self.img_img.findall(page_big)[0]
                    print img_src
                    self.img_list.append(img_src)
    
        def download(self):
            #
            #download the src
            #
            for src in self.img_list:
                print src
                urllib.urlretrieve(src, os.path.join(self.path,src.split('/')[-1]))
    
    
    class ReptileFactory(object):
        '''
        Factory mapping a crawler name to its crawler class.
        '''
        def __init__(self, reptile):
            '''reptile: registry key of the desired crawler, e.g. 'douban'.'''
            self.value = reptile
            # Registry of known crawler classes.
            self.reptile_dict = {'douban':DoubanReptile}

        def get_reptile(self):
            '''
            Return the crawler class registered for self.value.

            Raises KeyError if the name is unknown.
            '''
            return self.reptile_dict[self.value]

        # Backward-compatible alias: the original public name was a typo.
        get_reptie = get_reptile
    
    
    
    if __name__ == '__main__':
        # Look up the Douban crawler class via the factory, then run the
        # full pipeline: compile patterns, discover album pages, collect
        # image URLs, download them, and finally release the log handler.
        album_url = 'http://www.douban.com/online/11698467/album/125554615/'
        crawler_cls = ReptileFactory('douban').get_reptie()
        crawler = crawler_cls(album_url)
        crawler.compile_p()
        crawler.get_urls()
        crawler.get_img_src()
        crawler.download()
        crawler.close_log()
        

    今天学了一下python强大的logging模块,于是就想着用一下,然后我就想到了爬虫,好多时候我们需要记录一下其中过程,日志是一个非常好的方法,虽然写文件也可以完成,但是却没有这个日志强大。

  • 相关阅读:
    SQL多表关联原理研究实验验证
    SQL多表关联原理研究实验验证
    vs2015如何设置不显示类或函数前的引用数量
    vs2015如何设置不显示类或函数前的引用数量
    Visual Studio 中突出显示的引用
    Visual Studio 中突出显示的引用
    GridControl标题及单元格内容居中显示
    GridControl标题及单元格内容居中显示
    DevExpress的GridControl控件设置自定义显示方法
    DevExpress的GridControl控件设置自定义显示方法
  • 原文地址:https://www.cnblogs.com/MyselfDancing/p/3574259.html
Copyright © 2011-2022 走看看