zoukankan      html  css  js  c++  java
  • python爬取豆瓣里面活动照片的工厂设计模式

    #!/usr/bin/python
    # coding: utf-8
    
    #
    # filename: pachong of douban
    # 
    # author: Hacker_MJW
    #
    # Date: 2014-02-28
    #
    
    
    import urllib
    import urllib2
    
    import logging
    import os
    
    import re
    
    import time
    
    
    
    class SuperReptile(object):
        '''
        Base crawler class, meant to be subclassed.

        Sets up a per-crawler file logger (written to <pro_name>.txt in the
        working directory) and provides a shared open_url() helper.
        '''
        def __init__(self, init_url, pro_name):
            self.init_url = init_url
            self.pro_name = pro_name
            # Last successfully fetched page body. Starts as '' so that
            # open_url() can always return something, even if the very
            # first fetch fails (the original raised AttributeError there).
            self.page = ''

            self.log = logging.getLogger(self.pro_name)
            self.log.setLevel(logging.DEBUG)

            # One file handler per crawler, named after the crawler.
            self.handler = logging.FileHandler(filename=self.pro_name + '.txt')
            self.handler.setLevel(logging.DEBUG)

            self.formatter = logging.Formatter('%(asctime)s-%(name)s-%(levelname)s-%(message)s')
            self.handler.setFormatter(self.formatter)

            self.log.addHandler(self.handler)

        def open_url(self, url):
            '''
            Fetch url and return the response body.

            On HTTPError the status code is logged and the crawler backs off
            for 100 seconds (crude rate-limit handling); any other failure is
            logged with its cause. Returns the most recently fetched page
            ('' if nothing has been fetched yet).
            '''
            req = urllib2.Request(url)
            try:
                rps = urllib2.urlopen(req)
                self.page = rps.read()
                self.log.info('open %s successfully' % url)
            except urllib2.HTTPError as e:
                self.log.error('httperror %d' % e.code)
                time.sleep(100)
            except Exception as e:
                # Narrowed from a bare except: still best-effort, but the
                # failure is now recorded instead of silently dropped.
                self.log.error('unknown error: %s' % e)
            self.handler.flush()
            return self.page

        def download(self):
            # Hook: subclasses implement the actual download step.
            pass

        def close_log(self):
            '''Detach and close the file handler so the log file is released.'''
            self.log.removeHandler(self.handler)
            self.handler.flush()
            self.handler.close()
    
    
    class DoubanReptile(SuperReptile):
        '''
        superclass: __init__(self, init_url, pro_name)
        爬取豆瓣活动图片的爬虫
        '''
        def __init__(self, url):
            super(DoubanReptile, self).__init__(url, 'DoubanReptile')
            self.start_page = super(DoubanReptile, self).open_url(url).replace('
    ', '')
    
            self.p_img = '<divs*class="photo_wrap"s*>(.*?)</a>'
            self.p_img_href = '<as*href="(.*?)"'
            self.p_img_img = '<imgs*src="(.*?)"s*/>'
            self.p_urls_a = '<spans*class="thispage".*?>(.*?)<spans*class="next">'
            self.p_urls_b = '<as*href="(.*?)"s*>'
            self.img_list = []
    
            if not os.path.exists(os.getcwd()+'\doubanphotos'):
                try:
                    os.mkdir(os.getcwd()+'\doubanphotos')
                except:
                    pass
            else:
                self.path = os.getcwd() + '\doubanphotos'
    
        def compile_p(self):
            #
            #compile re pattern
            #
            self.img_p = re.compile(self.p_img)
            self.img_href = re.compile(self.p_img_href)
            self.img_img = re.compile(self.p_img_img)
            self.url_a = re.compile(self.p_urls_a)
            self.url_b = re.compile(self.p_urls_b)
    
        def get_urls(self):
            #
            #get a numbers of url of the item
            #
            div = self.url_a.findall(self.start_page)
            self.url_list = self.url_b.findall(div[0])
    
        def get_img_src(self):
            #
            #get a numbers of sources of the item
            #
            page_list = []
            page_list.append(self.start_page)
            for url in self.url_list:
                page = super(DoubanReptile, self).open_url(url).replace('
    ', '')
                page_list.append(page)
    
            for page in page_list:
                img_divs = self.img_p.findall(page)
                name = 'douban'
                for div in img_divs:
                    img_href = self.img_href.findall(div)
                    page_big = super(DoubanReptile, self).open_url(img_href[0]).replace('
    ', '')
                    img_src = self.img_img.findall(page_big)[0]
                    print img_src
                    self.img_list.append(img_src)
    
        def download(self):
            #
            #download the src
            #
            for src in self.img_list:
                print src
                urllib.urlretrieve(src, os.path.join(self.path,src.split('/')[-1]))
    
    
    class ReptileFactory(object):
        '''
        Simple factory: maps a crawler's short name to its class.
        '''
        def __init__(self, reptile):
            # Requested crawler name plus the registry of known crawlers.
            self.value = reptile
            self.reptile_dict = {'douban': DoubanReptile}

        def get_reptie(self):
            '''Return the crawler class registered under self.value.'''
            chosen = self.reptile_dict[self.value]
            return chosen
    
    
    
    if __name__ == '__main__':
        # Build the Douban crawler via the factory, then run the full
        # pipeline: compile patterns, discover pages, collect image URLs,
        # download, and finally release the log file.
        album_url = 'http://www.douban.com/online/11698467/album/125554615/'
        crawler_cls = ReptileFactory('douban').get_reptie()
        crawler = crawler_cls(album_url)
        crawler.compile_p()
        crawler.get_urls()
        crawler.get_img_src()
        crawler.download()
        crawler.close_log()
        

    今天学了一下python强大的logging模块,于是就想着用一下,然后我就想到了爬虫,好多时候我们需要记录一下其中过程,日志是一个非常好的方法,虽然写文件也可以完成,但是却没有这个日志强大。

  • 相关阅读:
    ECharts之柱状图 饼状图 折线图
    Vue自定义指令(directive)
    HDU 1231 最大连续子序列
    POJ 2533 Longest Ordered Subsequence
    HDU 1163 Eddy's digital Roots
    HDU 2317 Nasty Hacks
    HDU 2571 命运
    HDU 4224 Enumeration?
    HDU 1257 最少拦截系统
    HDU 2740 Root of the Problem
  • 原文地址:https://www.cnblogs.com/MyselfDancing/p/3574259.html
Copyright © 2011-2022 走看看